# kwork-parser/kwork_detail.py at main · collertop/kwork-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
import html as html_module
import time
import logging
import os
import requests

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Template result in which every detail field is None.
# fetch_kwork_detail returns a copy of it whenever details cannot be obtained.
EMPTY = dict.fromkeys(
    (
        "description_length",
        "title_length",
        "has_video",
        "images_count",
        "has_portfolio",
        "portfolio_count",
        "has_faq",
        "has_tiers",
        "price_standard",
        "price_premium",
        "delivery_days_base",
        "tags",
        "repeat_orders_pct",
    )
)


def _extract_state_data(page_html):
    """Return the JSON object assigned to ``window.stateData`` in *page_html*.

    Returns ``None`` when the page has no ``window.stateData = {`` assignment.
    Raises ``json.JSONDecodeError`` (a ``ValueError``) when the text after
    the assignment is not valid JSON.
    """
    import json

    anchor = re.search(r'window\.stateData\s*=\s*(?=\{)', page_html)
    if anchor is None:
        return None
    # raw_decode parses exactly one JSON value starting at the given index and
    # ignores whatever follows it, so the result does not depend on which
    # script statement comes next or on "}" sequences inside string values.
    state, _ = json.JSONDecoder().raw_decode(page_html, anchor.end())
    return state


def _package_prices(packages):
    """Return ``(lowest, highest)`` package prices as floats.

    Entries that are not dicts, have a falsy ``price``, or have a price that
    cannot be converted to ``float`` are skipped. ``lowest`` is ``None`` when
    no usable price exists; ``highest`` is ``None`` unless at least two usable
    prices exist.
    """
    prices = []
    for package in packages:
        if not isinstance(package, dict):
            continue
        raw_price = package.get("price")
        if not raw_price:
            continue
        # Convert before sorting: numeric strings would otherwise be ordered
        # lexicographically, and mixed str/number lists cannot be sorted.
        try:
            prices.append(float(raw_price))
        except (TypeError, ValueError):
            continue
    prices.sort()
    lowest = prices[0] if prices else None
    highest = prices[-1] if len(prices) >= 2 else None
    return lowest, highest


def _classification_tags(kwork):
    """Collect the non-empty ``title`` of every child of every classification."""
    tags = []
    for classification in kwork.get("classifications", []) or []:
        if not isinstance(classification, dict):
            continue
        for child in classification.get("children", []) or []:
            if isinstance(child, dict) and child.get("title"):
                tags.append(child["title"])
    return tags


def fetch_kwork_detail(kwork_url: str, cookie: str) -> dict:
    """Download a kwork page and extract listing details from its stateData.

    Args:
        kwork_url: Path appended verbatim to ``https://kwork.ru``. A falsy
            value short-circuits without any network request.
        cookie: Value sent as the ``Cookie`` request header.

    Returns:
        A dict with the same keys as ``EMPTY``:

        * ``description_length`` - length of ``gdesc_source`` (or ``gdesc``)
          after unescaping HTML entities and stripping tags.
        * ``title_length`` - length of ``gtitle``.
        * ``has_video`` - whether the page markup mentions ``youtube``,
          ``vimeo`` or ``<video`` (case-insensitive).
        * ``images_count`` - 1 if ``firstPicture`` has a ``src``, else 0.
        * ``has_portfolio`` / ``portfolio_count`` - from ``kworkPortfolio``.
        * ``has_faq`` - whether ``kworkFaq`` is non-empty.
        * ``has_tiers`` - ``isPackage`` is truthy and ``packages`` non-empty.
        * ``price_standard`` / ``price_premium`` - lowest package price and,
          when at least two prices exist, the highest one.
        * ``delivery_days_base`` - ``days`` as an int, or ``None``.
        * ``tags`` - child titles from ``classifications``.
        * ``repeat_orders_pct`` - always ``None``.

        A copy of ``EMPTY`` is returned instead when the URL is falsy, the
        response status is not 200, stateData is missing, or any exception
        occurs. This function never raises to the caller.
    """
    if not kwork_url:
        return EMPTY.copy()

    full_url = f"https://kwork.ru{kwork_url}"
    headers = {
        "Cookie": cookie,
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    try:
        resp = requests.get(full_url, headers=headers, timeout=15)
        if resp.status_code != 200:
            logger.warning(f"HTTP {resp.status_code} для {full_url}")
            return EMPTY.copy()

        # Read the decoded body once; requests re-decodes it on every access.
        page_html = resp.text

        state = _extract_state_data(page_html)
        if state is None:
            logger.warning(f"stateData не найден: {full_url}")
            return EMPTY.copy()

        # "kwork" may be present but null, hence "or {}" rather than a default.
        kwork = state.get("kwork") or {}

        # Description: decode HTML entities, then strip tags.
        gdesc = kwork.get("gdesc_source", "") or kwork.get("gdesc", "") or ""
        clean_desc = re.sub(r'<[^>]+>', '', html_module.unescape(gdesc)).strip()

        gtitle = kwork.get("gtitle", "") or ""

        # NOTE(review): this scans the entire page markup, so a mention of
        # these words outside the listing itself would also count - confirm
        # that this is acceptable.
        has_video = bool(re.search(r'youtube|vimeo|<video', page_html, re.I))

        first_pic = state.get("firstPicture", {}) or {}

        # Portfolio attached to the kwork itself (not to the seller).
        kwork_portfolio = state.get("kworkPortfolio", []) or []

        faq = state.get("kworkFaq", []) or []

        packages = kwork.get("packages", []) or []
        is_package = kwork.get("isPackage", False)
        price_standard, price_premium = _package_prices(packages)

        days = kwork.get("days")
        try:
            delivery_days = int(days) if days is not None else None
        except (ValueError, TypeError):
            delivery_days = None

        return {
            "description_length": len(clean_desc),
            "title_length": len(gtitle),
            "has_video": has_video,
            "images_count": 1 if first_pic.get("src") else 0,
            "has_portfolio": len(kwork_portfolio) > 0,
            "portfolio_count": len(kwork_portfolio),
            "has_faq": len(faq) > 0,
            "has_tiers": bool(is_package and len(packages) > 0),
            "price_standard": price_standard,
            "price_premium": price_premium,
            "delivery_days_base": delivery_days,
            "tags": _classification_tags(kwork),
            # Repeat orders are not available through stateData.
            "repeat_orders_pct": None,
        }

    except Exception as e:
        # Deliberate best-effort boundary: network, JSON and unexpected-shape
        # errors are all logged and reported as an empty result.
        logger.error(f"Ошибка при парсинге {full_url}: {e}")
        return EMPTY.copy()