-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebScraper.py
More file actions
166 lines (113 loc) · 4.63 KB
/
WebScraper.py
File metadata and controls
166 lines (113 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import bs4
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import csv
import concurrent.futures
# In[2]:
#function to count the number of pages associated with the search
def pages(url):
    """Discover all pagination page numbers for a Talabat area listing.

    Fetches *url* through the ScraperAPI proxy, scrapes the pagination
    links, and appends every page number not seen before to the global
    ``list``.  If any new page numbers were found, recurses on the
    highest page discovered so far, since its pagination bar may reveal
    even higher page numbers; stops when a fetch yields nothing new.

    Side effects: mutates the module-level ``list`` (page numbers) and
    ``count`` (number of new pages found by the current call).
    """
    global count
    params = {'api_key': '42e2a129e57389d742a5f8564fb814bf', 'url': url}
    page = requests.get('http://api.scraperapi.com/', params=urlencode(params))
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('a', class_='d-block text-center clickable')
    count = 0
    for r in results:
        # Only the numeric pagination links matter; narrow the except to
        # the parse failure instead of a bare except that hides real bugs.
        try:
            page_no = int(r.text)
        except ValueError:
            continue
        if page_no not in list:
            list.append(page_no)
            count += 1
    if count == 0:
        return
    pages("https://www.talabat.com/uae/restaurants/1308/jumeirah-lakes-towers-jlt?&page=%d" % (list[-1]))
# In[3]:
list = []  # page numbers discovered by pages(); shared global accumulator
# Seed the crawl with the area's base listing URL.
pages("https://www.talabat.com/uae/restaurants/1308/jumeirah-lakes-towers-jlt")
# In[4]:
# Full set of listing URLs: the base page plus one URL per discovered page.
url_list = ["https://www.talabat.com/uae/restaurants/1308/jumeirah-lakes-towers-jlt"]
url_list += [
    "https://www.talabat.com/uae/restaurants/1308/jumeirah-lakes-towers-jlt?&page=%d" % n
    for n in list
]
# In[5]:
#column headers for the table
# Column headers for the output table, in the order rows are built.
header = ["brand_name",
          "cuisine_tags",
          "restaurant_rating",
          "delivery_time",
          "service_fee",
          "minimum_order_amount",
          "new_restaurant"]
# Create the CSV and write the header row.  newline='' is required when
# handing a file object to csv.writer (per the csv module docs); without
# it the writer emits blank rows between records on Windows.
with open('restaurant_data.csv', mode='w', newline='') as rest_data:
    data_writer = csv.writer(rest_data)
    data_writer.writerow(header)
# In[6]:
data = []  # accumulated rows, one list of fields per restaurant


def insert_data(url):
    """Scrape one Talabat listing page and append a row per restaurant to ``data``.

    Each appended row matches the CSV header order: [brand_name,
    cuisine_tags (list of str), restaurant_rating (str or "NA"),
    delivery_time (int), service_fee (int, 0 when "Free"),
    minimum_order_amount (int), new_restaurant (bool)].

    The page is fetched through the ScraperAPI proxy.  Parsing depends on
    Talabat's current CSS class names, so a site redesign will break it.
    """
    params = {'api_key': '42e2a129e57389d742a5f8564fb814bf', 'url': url}
    page = requests.get('http://api.scraperapi.com/', params=urlencode(params))
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('div', class_='list-itemstyles__VendorListItemContainer-ia2hbn-0 eLZatB')
    for res in results:
        obj = []  # fields for this restaurant, in header order
        # Restaurant name.
        brand_name = res.find('div', class_="restaurant-title pb-1")
        obj.append(brand_name.text)
        # Cuisines: strip commas, split into words, and drop the
        # connectives "and" / "&".  (Renamed the local from `str`, which
        # shadowed the builtin.)
        cuisine_tags = res.find('div', class_="cuisines-section pb-1 truncate")
        words = cuisine_tags.text.replace(',', "").split()
        cuisine_tags_arr = [w for w in words if w != "and" and w != "&"]
        obj.append(cuisine_tags_arr)
        # Ratings / "NEW" badge section.
        restaurant_info = res.find('div', class_="ratings-and-new-section pb-1 d-flex")
        new_info = restaurant_info.find('div', class_="new-restaurant-label f-10 text-center mr-2 f-500")
        new_flag = new_info is not None
        # Rating text, or "NA" when the restaurant has no rating widget.
        rating_info = res.find('div', class_="rating-displaystyles__RatingDisplayContainer-sc-19r5mol-0 jabHoj")
        if rating_info is not None:
            obj.append(rating_info.text)
        else:
            obj.append("NA")
        # Delivery info: delivery_time, service_fee, minimum_order_amount.
        delivery_info = res.find('div', class_="info-section pb-1 f-14 delivery-info d-flex")
        delivery_time = delivery_info.find('span', class_="mr-2")
        service_fee = delivery_time.find_next_sibling("span", "mr-2")
        minimum_order_amount = delivery_info.find('span', class_="d-sm-block")
        obj.append(int(delivery_time.text.split(" ")[1]))
        # "Free" delivery is recorded as a fee of 0.
        if service_fee.text != "Free":
            obj.append(int(float(service_fee.text)))
        else:
            obj.append(0)
        obj.append(int(float(minimum_order_amount.text.split()[1])))
        obj.append(new_flag)
        data.append(obj)
# In[8]:
# Fan the listing URLs out across a small thread pool; insert_data is
# I/O-bound (HTTP fetch), so threads overlap the network waits.  Exiting
# the with-block waits for every submitted task to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    for listing_url in url_list:
        pool.submit(insert_data, listing_url)
# In[9]:
#store the scraped data in csv file
# Append the scraped rows to the CSV (header row was written earlier in
# 'w' mode).  newline='' is required for csv.writer file objects (per
# the csv module docs) to avoid blank rows between records on Windows.
with open('restaurant_data.csv', mode='a', newline='') as rest_data:
    data_writer = csv.writer(rest_data)
    data_writer.writerows(data)
# In[ ]: