-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
83 lines (68 loc) · 2.58 KB
/
parser.py
File metadata and controls
83 lines (68 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import csv
import re
from datetime import datetime

import requests as r
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from pydantic import BaseModel
from typing_extensions import List
# Highest group number to scrape; the script iterates 1..NUMBER_OF_GROUPS inclusive.
NUMBER_OF_GROUPS = 18
# URL template of a group's schedule/session page on mai.ru; the "{}"
# placeholder is filled with the group number string built by groupUrl().
sessionURL = "https://mai.ru/education/studies/schedule/session/index.php?group=М8О-1{}БВ-25"


def groupUrl(groupNumber: int) -> str:
    """Return the schedule page URL for the given group number.

    Numbers below 10 are prefixed with a single "0" before being
    substituted into ``sessionURL``; larger numbers are inserted as-is.
    """
    prefix = "0" if groupNumber < 10 else ""
    return sessionURL.format(prefix + str(groupNumber))
def tag_text(tag: Tag):
    """Return the tag's text with whitespace normalized.

    Every run of one or more whitespace characters in ``tag.text`` is
    replaced by a single space, then leading and trailing whitespace is
    stripped from the result.
    """
    collapsed = re.sub(r"\s{1,}", " ", tag.text)
    return collapsed.strip()
class Subject(BaseModel):
    """One schedule entry parsed from a group's schedule page."""

    # Number of the group whose page this entry was parsed from.
    groupNumber: int
    # Whitespace-normalized text of the entry's date element.
    date: str
    # Text of the first lower-line item; the parser stores "-" when it is absent.
    time: str
    # Whitespace-normalized text of the entry's subject element.
    name: str
    # Text of the second lower-line item; the parser stores "-" when it is absent.
    lector: str
    # Text of the third lower-line item; the parser stores "-" when it is absent.
    auditory: str
# Seconds to wait for the server before giving up on one group's page.
_REQUEST_TIMEOUT = 30


def parseGroupShedule(session: r.Session, groupNumber: int) -> List[Subject]:
    """Download one group's schedule page and parse it into ``Subject`` rows.

    Args:
        session: HTTP session used to issue the GET request.
        groupNumber: Group number passed to ``groupUrl`` to build the page URL.

    Returns:
        One ``Subject`` per ``li.step-item`` element that has both a date and
        a subject node. Items missing either node are reported and skipped.
        An empty list is returned when the request fails (network error or
        timeout) or the server answers with a status other than 200.
    """
    url = groupUrl(groupNumber)
    try:
        # Without a timeout a single stalled connection would block the whole
        # run; network failures are reported the same way as a bad status.
        response = session.get(url, timeout=_REQUEST_TIMEOUT)
    except r.RequestException as error:
        print(f"[ERROR] can't GET group: {groupNumber}, url: {url} ({error})")
        return []
    if response.status_code != 200:
        print(f"[ERROR] can't GET group: {groupNumber}, url: {url}")
        return []

    soup = bs(response.text, "html.parser")
    subjects: List[Subject] = []
    for item in soup.find_all("li", class_="step-item"):
        date = item.find("span", class_="step-title text-body my-2 py-1")
        if date is None:
            print("Item: ", item, "have no date field!")
            continue
        subj = item.find("div", class_="d-flex align-items-center justify-content-between")
        if subj is None:
            print("Item: ", item, "have no subj field!")
            continue

        lower_line = item.find_all("li", class_="list-inline-item")
        if len(lower_line) != 3:
            print("Item: ", item, f" waited to have 3 items in lower line, but have only: {len(lower_line)}")
        # The lower line is read positionally as time, lector, auditory;
        # positions that are missing fall back to the "-" placeholder.
        cells = [tag_text(cell) for cell in lower_line[:3]]
        time, lector, auditory = cells + ["-"] * (3 - len(cells))

        subjects.append(
            Subject(
                date=tag_text(date),
                time=time,
                name=tag_text(subj),
                lector=lector,
                auditory=auditory,
                groupNumber=groupNumber,
            )
        )
    return subjects
# Scrape every group through one shared HTTP session; the context manager
# closes the session once all groups have been fetched.
subjects: List[Subject] = []
with r.Session() as session:
    for i in range(1, NUMBER_OF_GROUPS + 1):
        subjects.extend(parseGroupShedule(session, i))
        print("Group: {} parsed!".format(i))

# The fields are Cyrillic free text, so the encoding is pinned to UTF-8
# instead of the locale default. csv.writer quotes fields that contain
# commas or quotes, which hand-built f-string rows would silently corrupt.
# newline="" is what the csv module expects for files it writes;
# lineterminator="\n" keeps the line endings the script has always produced.
with open("session.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file, lineterminator="\n")
    writer.writerow(["date", "time", "groupNumber", "subject", "lector", "auditory"])
    writer.writerows(
        [s.date, s.time, s.groupNumber, s.name, s.lector, s.auditory]
        for s in subjects
    )