-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
75 lines (54 loc) · 1.88 KB
/
main.py
File metadata and controls
75 lines (54 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
from collections import Counter
from pprint import pprint
import requests
from bs4 import BeautifulSoup
def is_valid(word):
    """Return True unless *word* contains a bracket, i.e. looks like a
    section marker such as "[Refrain]" rather than an actual lyric word."""
    return not ("[" in word or "]" in word)
def extract_lyrics(url, max_retries=3):
    """Fetch a Genius song page and return its lyrics as a flat word list.

    Words are lowercased, stripped of trailing/leading commas and periods,
    and section markers like "[Refrain]" are filtered out via is_valid().

    Parameters:
        url: the song page URL to scrape.
        max_retries: how many times to re-request the page when the
            "lyrics" container is missing from the HTML (the site sometimes
            serves a variant page without it).

    Returns:
        A list of words, or [] on HTTP error, exhausted retries, or a
        placeholder page (one whose text contains "released").
    """
    print(f"Fetching lyrics {url}...")
    r = requests.get(url)
    if r.status_code != 200:
        print("Page impossible a recuperer.")
        return []
    soup = BeautifulSoup(r.content, 'html.parser')
    lyrics = soup.find("div", class_="lyrics")
    if not lyrics:
        # BUG FIX: the original recursed unconditionally here, so a page
        # that never exposes a .lyrics div looped until RecursionError.
        # Retry a bounded number of times instead.
        if max_retries > 0:
            return extract_lyrics(url, max_retries - 1)
        return []
    if "released" in lyrics.stripped_strings:
        return []
    all_words = []
    for sentence in lyrics.stripped_strings:
        sentence_words = [word.strip(",").strip(".").lower() for word in sentence.split() if is_valid(word)]
        all_words.extend(sentence_words)
    return all_words
def get_all_urls():
    """Page through the Genius API and collect song URLs for artist 29743.

    Follows the API's "next_page" cursor, sorted by popularity.

    Returns:
        A list of song-page URL strings. Stops when the API reports no
        next page or when a request fails.
    """
    page_number = 1
    links = []
    while True:
        r = requests.get(f"https://genius.com/api/artists/29743/songs?page={page_number}&sort=popularity")
        if r.status_code != 200:
            # BUG FIX: the original only acted on a 200; on any other
            # status page_number was never advanced and next_page never
            # set, so the loop re-requested the same failing URL forever.
            print(f"Request for page {page_number} failed ({r.status_code}); stopping.")
            break
        print(f"Fetching page {page_number}")
        response = r.json().get("response", {})
        next_page = response.get("next_page")
        # Guard against a missing/null "songs" key (extend(None) would raise).
        songs = response.get("songs") or []
        links.extend([song.get("url") for song in songs])
        page_number += 1
        if not next_page:
            print("No more pages to fetch.")
            break
    return links
def get_all_words():
    """Download every song's lyrics and print the 15 most frequent words.

    Only words longer than 5 characters are counted, to skip articles,
    pronouns, and other short filler words.

    Side effects: issues network requests (via get_all_urls and
    extract_lyrics) and pprints a list of (word, count) pairs.
    """
    urls = get_all_urls()
    words = []
    for url in urls:
        words.extend(extract_lyrics(url=url))
    counter = Counter(w for w in words if len(w) > 5)
    most_common_words = counter.most_common(15)
    pprint(most_common_words)


# Guard the entry point so importing this module does not trigger a full
# scrape; the original called get_all_words() unconditionally at import time.
if __name__ == "__main__":
    get_all_words()