-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingredient_analysis.py
More file actions
153 lines (116 loc) · 4.14 KB
/
ingredient_analysis.py
File metadata and controls
153 lines (116 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import bs4
import json
import re
from recipe_scraper import *
from tools_list import *
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import nltk
# Preparation/descriptor words that commonly appear alongside a food name
# in an ingredient line (e.g. "onions, chopped"). Used when parsing to
# decide which comma-separated fragment of the text is the actual food.
descriptors = [
'chopped',
'minced',
'cut',
'or',
'sliced',
'rinsed',
'diced'
]
def scrape_ingredients(recipe_url):
    """Scrape the ingredient strings from an allrecipes-style recipe page.

    Parameters
    ----------
    recipe_url : str
        URL of the recipe page to fetch.

    Returns
    -------
    list[str]
        One stripped text entry per ingredient span found on the page.
    """
    # grab webpage html; try/finally guarantees the connection is closed
    # even if read() raises partway through
    uClient = uReq(recipe_url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # find the ingredients by html tag -- the site marks each added
    # ingredient with this span class
    recipe_script = page_soup.findAll('span', {"class": "recipe-ingred_txt added"})
    return [ingredient.text.strip() for ingredient in recipe_script]
def scrape_instructions(recipe_url):
    """Scrape the instruction steps from an allrecipes-style recipe page.

    Parameters
    ----------
    recipe_url : str
        URL of the recipe page to fetch.

    Returns
    -------
    list[str]
        One stripped text entry per direction step found on the page.
    """
    # grab webpage html; try/finally guarantees the connection is closed
    # even if read() raises partway through
    uClient = uReq(recipe_url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # find the instructions by html tag
    recipe_script = page_soup.findAll('span', {"class": "recipe-directions__list--item"})
    return [step.text.strip() for step in recipe_script]
def get_ingredients_data(ingredients,
                         descriptors=('chopped', 'minced', 'cut', 'or',
                                      'sliced', 'rinsed', 'diced')):
    """Parse raw ingredient strings into quantity / measurement / food parts.

    Parameters
    ----------
    ingredients : list[str]
        Raw ingredient lines, e.g. "2 cups chopped onions".
    descriptors : sequence of str, optional
        Preparation words used to decide which comma-separated fragment of
        the food text to keep. Defaults to the module's descriptor list.

    Returns
    -------
    list[list]
        For each ingredient a labelled row of the form
        ['Quantity:', [tokens], 'Measurement:', [tokens], 'Ingredient:', str].
    """
    # measurement-unit membership set, built once outside the loop
    measures = {
        'teaspoon', 'teaspoons', 'cup', 'cups', 'ounce', 'ounces', 'clove',
        'pound', 'pounds', 'tablespoon', 'tablespoons', 'container',
        'package', 'bunch', 'can', 'cans',
    }
    ingredients_data = []
    for words in ingredients:
        quantity = []
        measurement = []
        food = ''
        # if parens are in the ingredient, delete them
        if re.search(r'\(', words):
            # get rid of trademark marker "(R)"
            words = re.sub(r'\(R\)', '', words)
            # get rid of other parenthesized notes, e.g. "(14 ounce)"
            words = re.sub(r'\s\([^)]*\)', '', words)
        for word in words.split():
            # quantity: any token containing a digit. r'\d' also matches
            # '0' (e.g. "0.5"), which the old '[1-9]' pattern missed and
            # misfiled as food text.
            if re.search(r'\d', word):
                quantity.append(word)
            # measurement unit
            elif word in measures:
                measurement.append(word)
            # otherwise it's part of the food description
            else:
                food = word if food == '' else food + ' ' + word
        # collapse multiple quantity tokens, e.g. ['1', '1/2'] -> '1+1/2'
        if len(quantity) > 1:
            quantity = ['+'.join(quantity)]
        # now we have the food part of the ingredient; strip punctuation by
        # pulling out runs of words (optionally space/hyphen separated) --
        # a comma splits the food into fragments
        fragments = [m[0] for m in re.findall(r"((\w+ ?-?)+)", food)]
        if len(fragments) > 1:
            # keep the second fragment unless it is a preparation
            # descriptor, in which case the first fragment is the food
            if any(d in fragments[1] for d in descriptors):
                food = fragments[0]
            else:
                food = fragments[1]
        elif fragments:
            food = fragments[0]
        # else: food stays '' (the line held only quantity/measure tokens;
        # the old code raised IndexError here)
        ingredients_data.append(['Quantity:', quantity, 'Measurement:', measurement, 'Ingredient:', food])
    return ingredients_data
def analyzeIngredients(start_url):
    """Build a full analysis of the recipe at *start_url*.

    Scrapes the page's ingredients and instructions, parses each
    ingredient into quantity/measurement/food parts, then POS-tags the
    food phrase to pull out adjectives (tag 'JJ', reported as
    descriptors) and past participles (tag 'VBN', reported as
    preparations). The result list ends with the recipe's primary
    methods and its required tools.
    """
    ingredients = scrape_ingredients(start_url)
    instructions = scrape_instructions(start_url)
    results = []
    for entry in get_ingredients_data(ingredients):
        # the food phrase is the last element of each parsed entry
        tagged = nltk.pos_tag(nltk.word_tokenize(entry[-1]))
        adjectives = [tok for tok, pos in tagged if pos == 'JJ']
        participles = [tok for tok, pos in tagged if pos == 'VBN']
        results.append(['Recipes:', entry, 'Descriptor:', adjectives, 'Preparation:', participles])
    results.append(['Primary Methods:', extractMethods(instructions)])
    results.append(['Tools:', makeToolsList(start_url)])
    return results
# start_url = 'https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/?internalSource=previously%20viewed&referringContentType=home%20page&clickId=cardslot%203'
# analysis_res = ingredient_analysis(start_url)