scapingtest.py
import requests
from bs4 import BeautifulSoup
import json
import threading
import mysql.connector
db_connection = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="",
    database="scrapiusdb"
)
db_cursor = db_connection.cursor(buffered=True)
# print(db_connection)
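# The queries below assume a `scrapeddata` table. The DDL here is a hypothetical
# sketch (column types and the id column are guesses; only the column names
# Site, user, data, title are confirmed by the queries in this script).
# Uncomment to create the table once:
# db_cursor.execute("""
#     CREATE TABLE IF NOT EXISTS scrapeddata (
#         id    INT AUTO_INCREMENT PRIMARY KEY,
#         Site  VARCHAR(255),
#         user  VARCHAR(64),
#         data  TEXT,
#         title VARCHAR(255)
#     )
# """)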
# ======================================================================================================================
def scrap_handler(url, m_attrs):
    # Fetch the page and parse it; m_attrs maps field names to tag selectors.
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Iterate the matched container elements oldest-first.
    for eachItem in reversed(soup.findAll(m_attrs["parent"]["type"], m_attrs["parent"]["atr"])):
        m_values = {}
        for keys in m_attrs:
            if keys != 'parent':
                m_values[keys] = eachItem.find(m_attrs[keys]["type"]).text
        # Insert only items not already stored for this site (deduplicated by title).
        query = "Select * from scrapeddata where Site=(%s) AND user='aa' AND title=(%s) Limit 1"
        db_cursor.execute(query, (url, m_values['heading']))
        myresult = db_cursor.fetchall()
        print(m_values['heading'])
        if len(myresult) == 0:
            query = "Insert into scrapeddata (Site, user, data, title) values (%s, 'aa', %s, %s)"
            db_cursor.execute(query, (url, str(m_values), m_values['heading']))
            db_connection.commit()
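# ======================================================================================================================
# mysql.connector connections are not thread-safe, so the module-level db_cursor
# shared by every thread can interleave queries under load. The function below is
# a minimal per-thread sketch, not part of the original design: the same
# scrape-and-insert logic, but each worker opens and closes its own connection.
# The name scrap_handler_threadsafe is hypothetical.
def scrap_handler_threadsafe(url, m_attrs):
    conn = mysql.connector.connect(
        host="localhost", user="root", passwd="", database="scrapiusdb"
    )
    cur = conn.cursor(buffered=True)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    for eachItem in reversed(soup.findAll(m_attrs["parent"]["type"], m_attrs["parent"]["atr"])):
        # Collect every configured field from this container element.
        m_values = {k: eachItem.find(m_attrs[k]["type"]).text for k in m_attrs if k != 'parent'}
        cur.execute(
            "Select * from scrapeddata where Site=(%s) AND user='aa' AND title=(%s) Limit 1",
            (url, m_values['heading']),
        )
        if len(cur.fetchall()) == 0:
            cur.execute(
                "Insert into scrapeddata (Site, user, data, title) values (%s, 'aa', %s, %s)",
                (url, str(m_values), m_values['heading']),
            )
            conn.commit()
    conn.close()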
#
# READING JSON FILE - SCHEMA
#
with open('userbase/test/schema.json', 'r') as file:
    json_data = file.read()
mSiteList = json.loads(json_data)
# Launch one scraper thread per configured site URL.
for site in mSiteList.keys():
    threading.Thread(target=scrap_handler, args=(site, mSiteList[site])).start()
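# A hypothetical example of the shape scrap_handler expects from
# userbase/test/schema.json (the real file is not shown here): each top-level
# key is a URL, its "parent" entry selects the container elements, and every
# other key names a field to extract from each container; a "heading" field is
# required because the deduplication query above relies on it.
#
# {
#     "https://example.com/news": {
#         "parent": {"type": "article", "atr": {"class": "post"}},
#         "heading": {"type": "h2"},
#         "summary": {"type": "p"}
#     }
# }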