WebCrawler.py
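# Scrapes recipe listings from simple-veganista.com by ingredient tag, then
# visits each recipe page and appends a '///'-delimited record (name,
# ingredients, detailed ingredients, instructions, yield, cook time) to
# RecipeDatabase.txt.
# Requires: pip install requests beautifulsoup4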
import requests
import bs4
import time


def recipeCollector():
    # Extract recipe URLs from each ingredient's tag page
    recipe_urls = []
    ingredient_list = ['avocado', 'chocolate', 'kale', 'tofu', 'quinoa',
                       'blueberries', 'mushrooms', 'zucchini']
    for ingredient in ingredient_list:
        base_url = f"https://simple-veganista.com/tag/{ingredient}/"
        response = requests.get(base_url, timeout=30)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        # Each listed recipe appears as a link inside an <h2 class="entry-title">
        entry_soup = soup.find_all('h2', class_='entry-title')
        for content in entry_soup:
            recipe_urls.append(content.find_all('a')[0].attrs['href'])
        print("URLs Collected!")
        print(recipe_urls)
        print("================================")
        time.sleep(10)  # be polite: pause between requests
    # Get recipe information from each URL
    counter = 0
    for recipe_url in recipe_urls:
        response = requests.get(recipe_url, timeout=30)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        result = []
        # Name
        name_soup = soup.find_all('h2', class_='tasty-recipes-title')
        name_content = name_soup[0].get_text() if name_soup else "N/A"
        result.append(name_content)
        # Detailed ingredients: full text of the ingredients section
        ingredient_details_soup = soup.find_all('div', class_='tasty-recipes-ingredients-body')
        if ingredient_details_soup:
            ingredients_details_content = ingredient_details_soup[0].get_text().strip("\n")
        else:
            ingredients_details_content = "N/A"
        result.append(ingredients_details_content)
        # Ingredient names: the <strong> tags inside the ingredients section
        if ingredient_details_soup:
            ingredients_content = ingredient_details_soup[0].find_all('strong')
            if ingredients_content:
                ingredients = ', '.join(tag.get_text() for tag in ingredients_content)
            else:
                ingredients = "N/A"
        else:
            ingredients = "N/A"
        result.append(ingredients)
        # Instructions
        instructions_soup = soup.find_all('div', class_='tasty-recipes-instructions-body')
        instructions_content = instructions_soup[0].get_text() if instructions_soup else "N/A"
        result.append(instructions_content)
        # Yield: remove the "Yield:" label as a prefix rather than with
        # str.strip('Yield: '), which strips a character set and can also
        # eat letters from the value itself
        yield_soup = soup.find_all('li', class_='yield')
        if yield_soup:
            yield_content = yield_soup[0].get_text().replace('Yield:', '').strip()
        else:
            yield_content = "N/A"
        result.append(yield_content)
        # Total cook time
        cooktime_soup = soup.find_all('li', class_='total-time')
        cooktime_content = cooktime_soup[0].get_text() if cooktime_soup else "N/A"
        result.append(cooktime_content)
        # Assemble one record: name, ingredients, detailed ingredients,
        # instructions, yield, cook time, each field terminated by '///'
        print_result = "***Recipe***\n"
        print_result += result[0] + '///\n'
        print_result += result[2] + '///\n'
        print_result += result[1] + '///\n'
        print_result += result[3] + '///\n'
        print_result += result[4] + '///\n'
        print_result += result[5] + '///\n'
        print_result += "******\n"
        # Append the record to the database file
        with open('RecipeDatabase.txt', 'a') as fp:
            fp.write(print_result)
        counter += 1
        print(f"Recipe {counter} Collected!")
        time.sleep(10)  # pause before the next recipe page


if __name__ == "__main__":
    recipeCollector()
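

# A minimal sketch (an addition, not part of the original crawler) of how the
# '///'-delimited records written above could be read back. It assumes
# RecipeDatabase.txt sits in the working directory and that every record keeps
# the "***Recipe***" / "******" framing produced by recipeCollector(); the
# readRecipes name is hypothetical.
def readRecipes(path='RecipeDatabase.txt'):
    recipes = []
    with open(path) as fp:
        blocks = fp.read().split("***Recipe***\n")
    for block in blocks[1:]:
        body = block.split("******")[0]
        # Field order matches the writer: name, ingredients, detailed
        # ingredients, instructions, yield, cook time ("N/A" when missing)
        fields = [f.strip() for f in body.split('///') if f.strip()]
        recipes.append(fields)
    return recipes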