CodeFun/2.py at main · DontAskMeWhy0507/CodeFun · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import requests
from bs4 import BeautifulSoup
import json
import re

## Scrape movie data from Rophim.mx

# Fetch the movie listing page.
# The timeout keeps the script from blocking forever on an unresponsive
# server (requests applies no timeout by default), and raise_for_status()
# aborts on 4xx/5xx so an error page is never parsed as the listing.
response = requests.get("https://www.rophim.mx/phimhay", timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Keep two copies of the raw HTML on disk for manual inspection.
for html_path in ("phimhay_page.html", "rophim.html"):
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(response.text)


# Result container that is later serialized to JSON:
#   "movies"    - list that the extraction passes append records to
#   "site_info" - static facts about the scraped site
organized_data = dict(
    movies=[],
    site_info=dict(
        name="RoPhim",
        url="https://www.rophim.mx",
        description="Phim hay cả rổ - Xem phim HD online miễn phí",
    ),
)

# Primary selector: anchors carrying the "v-thumbnail" class.
thumbnail_anchors = soup.find_all("a", class_="v-thumbnail")

if thumbnail_anchors:
    movie_links = thumbnail_anchors
else:
    # Fallback: any anchor whose href contains "/phim/".
    print("No v-thumbnail found, trying alternative patterns...")
    movie_links = soup.find_all("a", href=re.compile(r"/phim/"))

print(f"Found {len(movie_links)} movie links")

# Build one record per anchor; records that end up without a title are dropped.
for anchor in movie_links:
    href = anchor.get("href", "")
    record = {
        "link": href,
        # Hrefs starting with "/" get the site origin prepended; anything
        # else is stored unchanged.
        "full_url": f"https://www.rophim.mx{href}" if href.startswith("/") else href,
    }

    # Slug is the last "/"-separated segment of the href
    # (e.g. "thu-thach-than-tuong.4FmPc8om").
    if href:
        record["slug"] = href.rsplit("/", 1)[-1]

    # Poster URL and title both come from the first <img> inside the anchor.
    poster_img = anchor.find("img")
    if poster_img:
        record["poster"] = poster_img.get("src", "")
        # The title is the alt text with the "Xem Phim " and
        # " Vietsub HD Online - Rophim" fragments removed.
        alt_text = poster_img.get("alt", "")
        alt_text = alt_text.replace("Xem Phim ", "")
        record["title"] = alt_text.replace(" Vietsub HD Online - Rophim", "")

    # Episode text is the <strong> inside the "pin-new" div, when both exist.
    badge = anchor.find("div", class_="pin-new")
    episode_tag = badge.find("strong") if badge else None
    if episode_tag:
        record["episodes"] = episode_tag.get_text(strip=True)

    # Keep the record only if a non-empty title was found.
    if record.get("title"):
        organized_data["movies"].append(record)

organized_data["total_movies"] = len(organized_data["movies"])

# When the page has a <meta name="description"> tag, store its "content"
# attribute (or "" if the attribute is absent) as the site description.
meta_description = soup.find("meta", attrs={"name": "description"})
if meta_description:
    site_info = organized_data["site_info"]
    site_info["description"] = meta_description.get("content", "")

# Scan <script> tags for self.__next_f.push([...]) calls and keep every JSON
# payload that mentions one of the movie-related keywords.
next_push_pattern = re.compile(r'self\.__next_f\.push\((\[.*?\])\)', re.DOTALL)
movie_keywords = ("phim", "movie", "title", "href")

scripts = soup.find_all("script")
for script_tag in scripts:
    source_text = script_tag.string
    # Skip scripts with no usable text and scripts without a push call.
    if not source_text or "self.__next_f.push" not in source_text:
        continue

    for raw_payload in next_push_pattern.findall(source_text):
        try:
            payload = json.loads(raw_payload)
        except json.JSONDecodeError:
            # Best effort: captured fragments that are not valid JSON are skipped.
            continue

        # Only lists with more than one element are considered.
        if not isinstance(payload, list) or len(payload) <= 1:
            continue

        # The keyword search runs over the string form of the decoded payload.
        payload_text = str(payload)
        if any(keyword in payload_text for keyword in movie_keywords):
            organized_data["movies"].append({
                "raw_data": payload,
                "type": "next_data"
            })

# Collect every site-relative link that has visible text, skipping the
# common navigation/footer entries listed below.
navigation_titles = {
    "Hỏi-Đáp",
    "Chính sách bảo mật",
    "Điều khoản sử dụng",
    "Giới thiệu",
    "Liên hệ",
}

movie_links = []
for link in soup.find_all("a", href=True):
    href = link.get("href", "")
    title = link.get_text(strip=True)

    # Only hrefs starting with "/" qualify. That single check already rules
    # out empty hrefs and "#" fragments, so no separate tests are needed.
    if not title or not href.startswith("/") or title in navigation_titles:
        continue

    movie_links.append({
        "title": title,
        "link": href,
        # href is known to start with "/" here, so the origin is always prepended.
        "full_url": f"https://www.rophim.mx{href}",
    })

# The key is only added when at least one link was collected.
if movie_links:
    organized_data["discovered_links"] = movie_links

# Generic pass over <div> elements whose class matches movie|film|card|item.
# Each div contributes a record holding whichever of title / link / image /
# description could be found inside it.
movie_sections = soup.find_all("div", class_=re.compile(r"movie|film|card|item"))
for container in movie_sections:
    heading = container.find(["h1", "h2", "h3", "h4", "a"])
    anchor = container.find("a", href=True)
    picture = container.find("img")
    paragraph = container.find("p")

    movie_info = {}
    if heading:
        movie_info["title"] = heading.get_text(strip=True)
    if anchor:
        movie_info["link"] = anchor.get("href")
    if picture:
        # Fall back to "data-src" when "src" is missing or empty.
        movie_info["image"] = picture.get("src") or picture.get("data-src")
    if paragraph:
        movie_info["description"] = paragraph.get_text(strip=True)

    # Divs that yielded nothing at all are not recorded.
    if movie_info:
        organized_data["movies"].append(movie_info)

# Recount right before saving: entries are appended to the "movies" list after
# "total_movies" is first set earlier in the script, so without this the saved
# JSON and the summary below would report a stale count.
organized_data["total_movies"] = len(organized_data["movies"])

# Save organized data (ensure_ascii=False keeps non-ASCII titles readable).
with open("extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(organized_data, f, ensure_ascii=False, indent=2)

print(f"\n✅ Extracted {organized_data['total_movies']} movies from RoPhim")
print("📁 Saved to extracted_data.json")

# Print the first 3 entries as a sample; fields an entry lacks show as "N/A".
if organized_data["movies"]:
    print("\n📽️  Sample movies:")
    for i, movie in enumerate(organized_data["movies"][:3], 1):
        print(f"\n{i}. {movie.get('title', 'N/A')}")
        print(f"   URL: {movie.get('full_url', 'N/A')}")
        print(f"   Episodes: {movie.get('episodes', 'N/A')}")
        # The poster URL is cut to 60 characters to keep the line short.
        print(f"   Poster: {movie.get('poster', 'N/A')[:60]}...")