diff --git a/Scraping.py b/Scraping.py
new file mode 100644
index 0000000..e911c03
--- /dev/null
+++ b/Scraping.py
@@ -0,0 +1,155 @@
+import os
+from itertools import zip_longest
+from urllib.request import urlopen
+
+from bs4 import BeautifulSoup
+from sqlalchemy.inspection import inspect
+from sqlalchemy.orm import sessionmaker
+
+from database import *
+from email_sending import *
+
+# Shared SQLAlchemy session used by every scraping step in this module.
+session = sessionmaker(bind=engine)()
+# course_id of every lesson inserted during this run; e-mailed at the end.
+new_aca_lesson_list = []
+
+
+def _value_after(strings, labels):
+    """Return the string following the first of *labels* found in *strings*.
+
+    Returns None when no label is present or the label is the last string.
+    """
+    for label in labels:
+        if label in strings:
+            idx = strings.index(label) + 1
+            return strings[idx] if idx < len(strings) else None
+    return None
+
+
+def info_for_every_lesson(url):
+    """Scrape one course page and save the lesson and its tutors.
+
+    The page at *url* is downloaded and parsed. The lesson whose course_id
+    is the page's file name is inserted (and recorded in
+    ``new_aca_lesson_list``) or, when it already exists, updated. The tutor
+    rows of the lesson are replaced by the ones listed on the page and the
+    session is committed.
+    """
+    with urlopen(url) as resp:
+        data = resp.read()
+
+    soup = BeautifulSoup(data, 'html.parser')
+
+    header = soup.find("div", {"id": "header"})
+    course_id = os.path.basename(url).split(".")[0]
+    lesson_info = {
+        "course_name": header.h1.string,
+        "course_id": course_id,
+        "course_url": url,
+    }
+
+    # A page without the details table simply has no price or level.
+    table = soup.find("table", attrs={"class": "table"})
+    table_strings = tuple(table.stripped_strings) if table is not None else ()
+    price = _value_after(table_strings, ("Price:", "Tuition fee:"))
+    if price is not None:
+        lesson_info["price"] = price.strip("*")
+    level = _value_after(table_strings, ("Level:",))
+    if level is not None:
+        lesson_info["level"] = level
+
+    # A page without the tutors section yields no tutor rows.
+    tutors_div = soup.find("div", attrs={"id": "tutors"})
+    tutor_strings = list(tutors_div.stripped_strings) if tutors_div is not None else []
+
+    lesson = session.query(Lesson).filter(Lesson.course_id == course_id).one_or_none()
+    if lesson is None:
+        lesson = Lesson(**lesson_info)
+        session.add(lesson)
+        # Flush so lesson_id is assigned before the tutors reference it.
+        session.flush()
+        new_aca_lesson_list.append(course_id)
+    else:
+        lesson.course_name = lesson_info["course_name"]
+        lesson.course_url = url
+        lesson.price = lesson_info.get("price")
+        lesson.level = lesson_info.get("level")
+        # Remove tutors saved by an earlier run; re-adding them on every
+        # scrape would pile up duplicate rows for the same lesson.
+        session.query(Tutor).filter(Tutor.lesson_id == lesson.lesson_id).delete()
+
+    # The strings are read as alternating full name / company; a trailing
+    # name without a company is kept with company=None instead of raising.
+    pairs = zip_longest(tutor_strings[::2], tutor_strings[1::2])
+    for full_name, company in pairs:
+        session.add(Tutor(full_name=full_name, company=company, lesson_id=lesson.lesson_id))
+    session.commit()
+
+
+def _absolute_course_url(href):
+    """Build an absolute URL from an href found on the ACA index page."""
+    if href.startswith(("http://", "https://")):
+        return href
+    # Drop leading "./" and "../" segments one at a time: str.strip() works
+    # on a set of characters and would also eat dots and slashes that are
+    # part of the path itself.
+    while href.startswith(("./", "../")):
+        href = href.split("/", 1)[1]
+    href = href.lstrip("/")
+    if "en/" not in href:
+        return f"https://aca.am/en/{href}"
+    return f"https://aca.am/{href}"
+
+
+def first_scraping_step(path):
+    """Collect the course links of the saved index page and scrape each one.
+
+    *path* is the HTML file of the index page. Links are taken from the
+    "intro-level", "intermediate-level" and "advanced-level" sections, each
+    linked page is handed to ``info_for_every_lesson`` and finally the
+    lessons added during this run are e-mailed.
+    """
+    with open(path, "r", encoding="utf8") as file:
+        data = file.read()
+
+    soup = BeautifulSoup(data, 'html.parser')
+    hrefs = []
+    for section_id in ("intro-level", "intermediate-level", "advanced-level"):
+        section = soup.find('div', attrs={"id": section_id})
+        if section is None:
+            # The level is not present on the page; nothing to collect.
+            continue
+        hrefs.extend(a["href"] for a in section.find_all("a") if a.get("href"))
+
+    # dict.fromkeys() drops repeated links while keeping the page order, so
+    # a course linked more than once is fetched only once.
+    url_list = list(dict.fromkeys(_absolute_course_url(href) for href in hrefs))
+    for one_url in url_list:
+        info_for_every_lesson(one_url)
+
+    send_email_for_new_lessons(new_aca_lesson_list, session)
+
+
+def enter_first_aca_page():
+    """Download the ACA index page, make sure the tables exist, then scrape.
+
+    The page is saved as ``aca_en.html`` in the current directory.
+    """
+    aca_path = "https://aca.am/en/index.html"
+    with urlopen(aca_path) as response:
+        # NOTE: for any status other than 200 nothing is written, and the
+        # scraping below reads whatever aca_en.html is already on disk.
+        if response.getcode() == 200:
+            with open("aca_en.html", "wb") as fd:
+                fd.write(response.read())
+
+    en_inspect = inspect(engine)
+    if not (en_inspect.has_table("lessons") and en_inspect.has_table("tutors")):
+        metadata.create_all(bind=engine)
+
+    first_scraping_step("aca_en.html")
+
+
+if __name__ == "__main__":
+    enter_first_aca_page()
diff --git a/database.py b/database.py
new file mode 100644
index 0000000..5a0feb3
--- /dev/null
+++ b/database.py
@@ -0,0 +1,36 @@
+from sqlalchemy import create_engine, Column, String, Integer, ForeignKey
+from sqlalchemy.orm import declarative_base
+from sqlalchemy.orm import relationship
+
+
+# SQLite database stored in the file lesson_and_tutor.db (relative path).
+engine = create_engine("sqlite:///lesson_and_tutor.db")
+Base = declarative_base()
+
+
+class Lesson(Base):
+    """A course, stored in the ``lessons`` table."""
+
+    __tablename__ = "lessons"
+    lesson_id = Column(Integer, primary_key=True)
+    course_name = Column(String, nullable=False)
+    # Unique identifier of the course, used to look up existing rows.
+    course_id = Column(String, nullable=False, unique=True)
+    course_url = Column(String, nullable=False)
+    price = Column(String)
+    level = Column(String)
+    # Tutor rows whose lesson_id points at this lesson.
+    tutors = relationship("Tutor")
+
+
+class Tutor(Base):
+    """A tutor of one lesson, stored in the ``tutors`` table."""
+
+    __tablename__ = "tutors"
+    tutor_id = Column(Integer, primary_key=True)
+    full_name = Column(String)
+    company = Column(String)
+    lesson_id = Column(Integer, ForeignKey("lessons.lesson_id"))
+
+
+metadata = Base.metadata
diff --git a/email_sending.py b/email_sending.py
new file mode 100644
index 0000000..8037a88
--- /dev/null
+++ b/email_sending.py
@@ -0,0 +1,47 @@
+from smtplib import SMTP_SSL
+import ssl
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from getpass import getpass
+from database import *
+
+
+def send_email_for_new_lessons(new_lesson_list, session):
+    """E-mail a summary of the lessons whose course_id is in *new_lesson_list*.
+
+    The lessons and their tutors are read through *session*. Nothing is
+    sent, and no password is asked for, when there is nothing to report.
+    Otherwise the account password is read with ``getpass`` and the message
+    is sent through Gmail over SSL.
+    """
+    if not new_lesson_list:
+        return
+    courses = session.query(Lesson).filter(Lesson.course_id.in_(new_lesson_list)).all()
+    if not courses:
+        return
+
+    lines = []
+    for idx, course in enumerate(courses, start=1):
+        all_tutors = ",".join(
+            f"Tutor:{tutor.full_name}-comp:{tutor.company}" for tutor in course.tutors
+        )
+        lines.append(
+            f"{idx}) Name:{course.course_name} Price:{course.price} Level:{course.level} "
+            f"Link:{course.course_url} Tutors:{all_tutors} \n"
+        )
+    course_info = "".join(lines)
+
+    address = "sonasah1919@gmail.com"
+    msg_text = f"""Hi Dear user,\nPlease find below the updated list of suggested courses\n{course_info}Best wishes\nACA administration"""
+    # "mixed" is the standard multipart subtype; "multipart/multipart" is not one.
+    message = MIMEMultipart("mixed")
+    message["Subject"] = "ACA new courses"
+    message["From"] = address
+    message["To"] = address
+    message.attach(MIMEText(msg_text, "plain"))
+
+    context = ssl.create_default_context()
+    input_pass = getpass()
+    with SMTP_SSL("smtp.gmail.com", context=context) as smtp_server:
+        smtp_server.login(address, password=input_pass)
+        smtp_server.sendmail(address, [address], msg=message.as_string())