-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWikiDeath.py
More file actions
103 lines (86 loc) · 2.77 KB
/
WikiDeath.py
File metadata and controls
103 lines (86 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import requests
import json
import wikipediaapi
from tqdm.auto import tqdm
# Shared Wikipedia client for the English wiki; the first argument identifies
# this script (project name and contact address) to the API.
wiki_wiki = wikipediaapi.Wikipedia("MyProjectName (merlin@example.com)", "en")

# Plain text of the "Lists of deaths by year" page, split into lines.
Lists_of_deaths_by_year = wiki_wiki.page("Lists_of_deaths_by_year").text.split("\n")

# Module-level accumulators shared by the functions below.
list_years, url_list, missing_list = [], [], []
deadge = {}

# Month names, in calendar order, as they appear in the page titles.
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
def main():
    """
    Run the whole pipeline for the years 1992 through 2021.

    Builds the page titles, scrapes the death records into the module-level
    ``deadge`` dictionary and writes that dictionary to disk as JSON.
    """
    years = list(range(1992, 2022))
    create_page_url(years)
    scrap_wikipage(years, url_list)
    write_wikidead(deadge)
def create_page_url(list_years):
    """
    Build the "Deaths_in_<Month>_<year>" page title for every month of every
    given year.

    The titles are appended to the module-level ``url_list`` (all months of
    the first year, then all months of the next one, following the order of
    ``months``); titles already in the list are kept.
    Parameters:
    - list_years: Iterable of years
    Returns:
    - url_list: The module-level list of page titles, after the additions
    """
    url_list.extend(
        f"Deaths_in_{month}_{year}" for year in list_years for month in months
    )
    return url_list
def _parse_day_records(section_text):
    """Parse the text of one day's section into ``{name: {"Age", "Role"[, "cause"]}}``."""
    # Cut the text at the references heading explicitly. str.strip(chars)
    # treats its argument as a *set of characters*, so using it here would
    # also eat leading/trailing letters such as "R", "e" or "s" from records.
    body = section_text.partition("== References ==")[0]
    records = {}
    for line in body.split("\n"):
        fields = [field.strip() for field in line.split(",")]
        # Only "name, age, role" and "name, age, role, cause" lines are
        # understood; anything else (blank lines, extra commas) is skipped.
        if len(fields) not in (3, 4) or not fields[0]:
            continue
        entry = {"Age": fields[1], "Role": fields[2]}
        if len(fields) == 4:
            entry["cause"] = fields[3]
        # Key by name so several deaths on the same day do not overwrite
        # each other.
        records[fields[0]] = entry
    return records


def scrap_wikipage(list_years, url_list):
    """
    Scrape the "Deaths_in_<Month>_<year>" Wikipedia pages for death records.

    Results are stored in the module-level ``deadge`` dictionary, laid out as
    ``deadge[year][month][day][name] = {"Age": ..., "Role": ..., "cause": ...}``
    where year and day are strings and "cause" is present only when the
    record has a fourth comma-separated field. Every day from 1 to 31 gets an
    entry for every month; days without a matching section (or without any
    parsable record) map to an empty dictionary.
    Parameters:
    - list_years: List of years
    - url_list: List of Wikipedia page titles. Kept for backward
      compatibility; the title is rebuilt from the year and month so that
      each month reads its own page.
    Returns:
    - deadge: Dictionary containing death records
    """
    for year in tqdm(list_years):
        year_records = deadge[str(year)] = {}
        for month in months:
            month_records = year_records[str(month)] = {}
            # Use this month's own page (not url_list[0]) and create the page
            # object once per month so the same page is not requested again
            # for each of the 31 day lookups.
            page = wiki_wiki.page("Deaths_in_{0}_{1}".format(month, year))
            for day in range(1, 32):
                section = page.section_by_title(str(day))
                # No section with that title, e.g. day 31 of a short month
                # or a page that does not exist.
                if section is None:
                    month_records[str(day)] = {}
                else:
                    month_records[str(day)] = _parse_day_records(section.text)
    return deadge
def write_wikidead(deadge, path="c:\\Users\\Filippo\\Desktop\\Python\\Data\\Dead.json"):
    """
    Write death records to a JSON file.

    Parameters:
    - deadge: Dictionary containing death records
    - path: Destination file; defaults to the location the script has always
      used, so existing callers are unaffected. The file is overwritten.
    """
    # json.dump escapes non-ASCII characters by default, so the explicit
    # encoding only makes the output independent of the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(deadge, f, indent=3)
# Run the scraper only when this file is executed as a script, so importing
# the module does not start the full scrape.
if __name__ == "__main__":
    main()