"""Scrape course pages from aca.am, store them in SQLite and e-mail new ones.

This module is the final state (after patch 4/4) of ``database.py``,
``email_sending.py`` and ``Scraping.py`` from the patch series by Sonasah96
("first_one", "second_one", "2nd commit", March 2021).  The three files are
merged into one module, so the wildcard cross-imports between them are gone;
every public name they defined is still available here.
"""
import logging
import os
import ssl
from email.message import EmailMessage
from getpass import getpass
from itertools import zip_longest
from smtplib import SMTP_SSL
from urllib.error import URLError
from urllib.parse import urljoin, urlsplit
from urllib.request import urlopen

from bs4 import BeautifulSoup
from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import declarative_base, relationship, sessionmaker

logger = logging.getLogger(__name__)

ACA_INDEX_URL = "https://aca.am/en/index.html"
ACA_INDEX_FILE = "aca_en.html"
LEVEL_SECTION_IDS = ("intro-level", "intermediate-level", "advanced-level")
PRICE_LABELS = ("Price:", "Tuition fee:")  # checked in this order
LEVEL_LABELS = ("Level:",)

SMTP_HOST = "smtp.gmail.com"
MAIL_ACCOUNT = "sonasah1919@gmail.com"  # used as login, sender and recipient
SMTP_PASSWORD_ENV = "ACA_SMTP_PASSWORD"  # optional; falls back to a prompt


# --------------------------------------------------------------------------
# database
# --------------------------------------------------------------------------

engine = create_engine("sqlite:///lesson_and_tutor.db")
Base = declarative_base()


class Lesson(Base):
    """A course page; ``course_id`` is the file stem of the course URL."""

    __tablename__ = "lessons"
    lesson_id = Column(Integer, primary_key=True)
    course_name = Column(String, nullable=False)
    course_id = Column(String, nullable=False, unique=True)
    course_url = Column(String, nullable=False)
    price = Column(String)
    level = Column(String)
    # delete-orphan: assigning a new list to ``tutors`` deletes the rows that
    # were dropped, so re-scraping a course replaces its tutors instead of
    # piling up duplicates.
    tutors = relationship("Tutor", cascade="all, delete-orphan")


class Tutor(Base):
    """A tutor (name and company) listed on one course page."""

    __tablename__ = "tutors"
    tutor_id = Column(Integer, primary_key=True)
    full_name = Column(String)
    company = Column(String)
    lesson_id = Column(Integer, ForeignKey("lessons.lesson_id"))


metadata = Base.metadata

session = sessionmaker(bind=engine)()
# course_id values inserted during this process; consumed by the e-mail step.
new_aca_lesson_list = []


# --------------------------------------------------------------------------
# e-mail
# --------------------------------------------------------------------------

def send_email_for_new_lessons(new_lesson_list, session):
    """E-mail a numbered summary of the courses whose ids are given.

    Args:
        new_lesson_list: ``course_id`` values of the courses to report.
        session: SQLAlchemy session used to load the ``Lesson`` rows.

    Nothing is sent (and no password is requested) when there is nothing to
    report.  The SMTP password is read from the ``ACA_SMTP_PASSWORD``
    environment variable if set, otherwise prompted for with ``getpass``.
    """
    if not new_lesson_list:
        logger.info("no new courses; not sending an e-mail")
        return

    courses = (
        session.query(Lesson)
        .filter(Lesson.course_id.in_(new_lesson_list))
        .all()
    )
    if not courses:
        logger.info("none of the new course ids are stored; not sending an e-mail")
        return

    summary_lines = []
    for idx, course in enumerate(courses, start=1):
        all_tutors = ",".join(
            f"Tutor:{tutor.full_name}-comp:{tutor.company}"
            for tutor in course.tutors
        )
        summary_lines.append(
            f"{idx}) Name:{course.course_name} Price:{course.price} "
            f"Level:{course.level} Link:{course.course_url} "
            f"Tutors:{all_tutors} \n"
        )
    course_info = "".join(summary_lines)

    # A single text/plain part; no multipart container is needed.
    message = EmailMessage()
    message["Subject"] = "ACA new courses"
    message["From"] = MAIL_ACCOUNT
    message["To"] = MAIL_ACCOUNT
    message.set_content(
        "Hi Dear user,\n"
        "Please find below the updated list of suggested courses\n"
        f"{course_info}Best wishes\nACA administration"
    )

    password = os.environ.get(SMTP_PASSWORD_ENV) or getpass()
    with SMTP_SSL(SMTP_HOST, context=ssl.create_default_context()) as smtp_server:
        smtp_server.login(MAIL_ACCOUNT, password=password)
        smtp_server.send_message(message)


# --------------------------------------------------------------------------
# scraping
# --------------------------------------------------------------------------

def _value_after(strings, labels):
    """Return the string that follows the first label found, else ``None``.

    *labels* are tried in order; a label that is the last item of *strings*
    has no value and yields ``None``.
    """
    for label in labels:
        if label in strings:
            position = strings.index(label) + 1
            return strings[position] if position < len(strings) else None
    return None


def _pair_tutors(strings):
    """Group a flat ``[name, company, name, company, ...]`` list into pairs.

    A trailing name without a company is kept with ``None`` as its company.
    """
    items = iter(strings)
    return list(zip_longest(items, items))


def _absolute_course_url(href, base=ACA_INDEX_URL):
    """Resolve *href* against the index page the way a browser would.

    Returns ``None`` for links that are not http(s) or that point to a host
    other than the one in *base* (sub-domains such as ``www.`` are accepted).
    """
    url = urljoin(base, href.strip())
    parts = urlsplit(url)
    if parts.scheme not in ("http", "https"):
        return None
    base_host = urlsplit(base).hostname or ""
    host = parts.hostname or ""
    if host != base_host and not host.endswith("." + base_host):
        return None
    return url


def info_for_every_lesson(url):
    """Download one course page and insert or update it in the database.

    The course name comes from ``div#header h1``, price and level from the
    ``table.table`` label/value cells, and tutors from ``div#tutors`` (read as
    alternating name / company strings).  Tutors of an already stored course
    are replaced by the freshly scraped ones.

    Args:
        url: absolute URL of the course page.

    Raises:
        ValueError: the page has no ``div#header`` with an ``<h1>``.
        urllib.error.URLError: the page could not be fetched.
        sqlalchemy.exc.SQLAlchemyError: the commit failed (the session is
            rolled back before re-raising).
    """
    with urlopen(url) as resp:
        data = resp.read()
    soup = BeautifulSoup(data, "html.parser")

    header = soup.find("div", attrs={"id": "header"})
    if header is None or header.h1 is None:
        raise ValueError(f"no course header found at {url}")

    info = {
        # get_text also works when <h1> contains nested tags, where .string
        # would be None and break the NOT NULL constraint.
        "course_name": header.h1.get_text(" ", strip=True),
        "course_id": os.path.basename(urlsplit(url).path).split(".")[0],
        "course_url": url,
    }

    table = soup.find("table", attrs={"class": "table"})
    cells = tuple(table.stripped_strings) if table is not None else ()
    price = _value_after(cells, PRICE_LABELS)
    if price is not None:
        info["price"] = price.strip("*")
    level = _value_after(cells, LEVEL_LABELS)
    if level is not None:
        info["level"] = level

    tutors_div = soup.find("div", attrs={"id": "tutors"})
    tutor_strings = list(tutors_div.stripped_strings) if tutors_div is not None else []

    lesson = (
        session.query(Lesson)
        .filter(Lesson.course_id == info["course_id"])
        .one_or_none()
    )
    is_new = lesson is None
    if is_new:
        lesson = Lesson(**info)
        session.add(lesson)
    else:
        lesson.course_name = info["course_name"]
        lesson.price = info.get("price")
        lesson.level = info.get("level")

    # Replacing the collection lets the ORM set the foreign key at flush time
    # and delete tutors that are no longer listed.
    lesson.tutors = [
        Tutor(full_name=full_name, company=company)
        for full_name, company in _pair_tutors(tutor_strings)
    ]

    try:
        session.commit()
    except SQLAlchemyError:
        session.rollback()
        raise

    # Only report the course once it is actually stored.
    if is_new:
        new_aca_lesson_list.append(info["course_id"])


def first_scraping_step(path):
    """Collect course links from the saved index page and process each one.

    Links are taken from the three level sections (``LEVEL_SECTION_IDS``),
    made absolute, de-duplicated in page order, and passed to
    ``info_for_every_lesson``.  A course page that fails is logged and
    skipped.  Finally the e-mail for newly inserted courses is sent.

    Args:
        path: path of the saved index HTML file (read as UTF-8).
    """
    with open(path, "r", encoding="utf8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")

    hrefs = []
    for section_id in LEVEL_SECTION_IDS:
        section = soup.find("div", attrs={"id": section_id})
        if section is None:
            logger.warning("section %r not found in %s", section_id, path)
            continue
        hrefs.extend(
            anchor["href"] for anchor in section.find_all("a") if anchor.get("href")
        )

    # dict.fromkeys drops repeated links while keeping first-seen order.
    url_list = dict.fromkeys(
        url for url in map(_absolute_course_url, hrefs) if url is not None
    )

    for one_url in url_list:
        try:
            info_for_every_lesson(one_url)
        except (URLError, ValueError, SQLAlchemyError):
            logger.exception("skipping course page %s", one_url)

    send_email_for_new_lessons(new_aca_lesson_list, session)


def enter_first_aca_page():
    """Download the aca.am index page, ensure the tables exist, and scrape.

    The page is saved to ``aca_en.html`` before parsing.  If the server does
    not answer with status 200 nothing is written or scraped, so a stale copy
    from an earlier run is never parsed by mistake.
    """
    with urlopen(ACA_INDEX_URL) as response:
        if response.status != 200:
            logger.error(
                "unexpected status %s for %s", response.status, ACA_INDEX_URL
            )
            return
        page = response.read()

    with open(ACA_INDEX_FILE, "wb") as fd:
        fd.write(page)

    # create_all skips tables that already exist (checkfirst defaults to True).
    metadata.create_all(bind=engine)

    first_scraping_step(ACA_INDEX_FILE)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    enter_first_aca_page()