-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingredient_analysis.py
More file actions
153 lines (116 loc) · 4.14 KB
/
ingredient_analysis.py
File metadata and controls
153 lines (116 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import bs4
import json
import re
from recipe_scraper import *
from tools_list import *
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import nltk
# Preparation/descriptor words that commonly appear alongside a food name
# in an ingredient line (e.g. "onions, chopped"). Used when parsing to
# decide which comma-separated fragment of the text is the actual food.
descriptors = [
'chopped',
'minced',
'cut',
'or',
'sliced',
'rinsed',
'diced'
]
def scrape_ingredients(recipe_url):
    """Scrape the ingredient strings from an allrecipes-style recipe page.

    Parameters
    ----------
    recipe_url : str
        URL of the recipe page to fetch.

    Returns
    -------
    list[str]
        One stripped text entry per ingredient span found on the page.
    """
    # grab webpage html; try/finally guarantees the connection is closed
    # even if read() raises partway through
    uClient = uReq(recipe_url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # find the ingredients by html tag -- the site marks each added
    # ingredient with this span class
    recipe_script = page_soup.findAll('span', {"class": "recipe-ingred_txt added"})
    return [ingredient.text.strip() for ingredient in recipe_script]
def scrape_instructions(recipe_url):
    """Scrape the instruction steps from an allrecipes-style recipe page.

    Parameters
    ----------
    recipe_url : str
        URL of the recipe page to fetch.

    Returns
    -------
    list[str]
        One stripped text entry per direction step found on the page.
    """
    # grab webpage html; try/finally guarantees the connection is closed
    # even if read() raises partway through
    uClient = uReq(recipe_url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # find the instructions by html tag
    recipe_script = page_soup.findAll('span', {"class": "recipe-directions__list--item"})
    return [step.text.strip() for step in recipe_script]
def get_ingredients_data(ingredients,
                         descriptors=('chopped', 'minced', 'cut', 'or',
                                      'sliced', 'rinsed', 'diced')):
    """Parse raw ingredient strings into quantity / measurement / food parts.

    Parameters
    ----------
    ingredients : list[str]
        Raw ingredient lines, e.g. "2 cups chopped onions".
    descriptors : sequence of str, optional
        Preparation words used to decide which comma-separated fragment of
        the food text to keep. Defaults to the module's descriptor list.

    Returns
    -------
    list[list]
        For each ingredient a labelled row of the form
        ['Quantity:', [tokens], 'Measurement:', [tokens], 'Ingredient:', str].
    """
    # measurement-unit membership set, built once outside the loop
    measures = {
        'teaspoon', 'teaspoons', 'cup', 'cups', 'ounce', 'ounces', 'clove',
        'pound', 'pounds', 'tablespoon', 'tablespoons', 'container',
        'package', 'bunch', 'can', 'cans',
    }
    ingredients_data = []
    for words in ingredients:
        quantity = []
        measurement = []
        food = ''
        # if parens are in the ingredient, delete them
        if re.search(r'\(', words):
            # get rid of trademark marker "(R)"
            words = re.sub(r'\(R\)', '', words)
            # get rid of other parenthesized notes, e.g. "(14 ounce)"
            words = re.sub(r'\s\([^)]*\)', '', words)
        for word in words.split():
            # quantity: any token containing a digit. r'\d' also matches
            # '0' (e.g. "0.5"), which the old '[1-9]' pattern missed and
            # misfiled as food text.
            if re.search(r'\d', word):
                quantity.append(word)
            # measurement unit
            elif word in measures:
                measurement.append(word)
            # otherwise it's part of the food description
            else:
                food = word if food == '' else food + ' ' + word
        # collapse multiple quantity tokens, e.g. ['1', '1/2'] -> '1+1/2'
        if len(quantity) > 1:
            quantity = ['+'.join(quantity)]
        # now we have the food part of the ingredient; strip punctuation by
        # pulling out runs of words (optionally space/hyphen separated) --
        # a comma splits the food into fragments
        fragments = [m[0] for m in re.findall(r"((\w+ ?-?)+)", food)]
        if len(fragments) > 1:
            # keep the second fragment unless it is a preparation
            # descriptor, in which case the first fragment is the food
            if any(d in fragments[1] for d in descriptors):
                food = fragments[0]
            else:
                food = fragments[1]
        elif fragments:
            food = fragments[0]
        # else: food stays '' (the line held only quantity/measure tokens;
        # the old code raised IndexError here)
        ingredients_data.append(['Quantity:', quantity, 'Measurement:', measurement, 'Ingredient:', food])
    return ingredients_data
def analyzeIngredients(start_url):
    """Build a full analysis of the recipe at *start_url*.

    Scrapes the page's ingredients and instructions, parses each
    ingredient into quantity/measurement/food parts, then POS-tags the
    food phrase to pull out adjectives (tag 'JJ', reported as
    descriptors) and past participles (tag 'VBN', reported as
    preparations). The result list ends with the recipe's primary
    methods and its required tools.
    """
    ingredients = scrape_ingredients(start_url)
    instructions = scrape_instructions(start_url)
    results = []
    for entry in get_ingredients_data(ingredients):
        # the food phrase is the last element of each parsed entry
        tagged = nltk.pos_tag(nltk.word_tokenize(entry[-1]))
        adjectives = [tok for tok, pos in tagged if pos == 'JJ']
        participles = [tok for tok, pos in tagged if pos == 'VBN']
        results.append(['Recipes:', entry, 'Descriptor:', adjectives, 'Preparation:', participles])
    results.append(['Primary Methods:', extractMethods(instructions)])
    results.append(['Tools:', makeToolsList(start_url)])
    return results
# start_url = 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/?internalSource=previously%20viewed&referringContentType=home%20page&clickId=cardslot%203'
# analysis_res = ingredient_analysis(start_url)