WebCrawler.py
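# Scrapes recipe listings from simple-veganista.com by ingredient tag, then
# visits each recipe page and appends a '///'-delimited record (name,
# ingredients, detailed ingredients, instructions, yield, cook time) to
# RecipeDatabase.txt.
# Requires: pip install requests beautifulsoup4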
import requests
import bs4
import time


def recipeCollector():
    # Extract recipe URLs from each ingredient's tag page
    recipe_urls = []
    ingredient_list = ['avocado', 'chocolate', 'kale', 'tofu', 'quinoa',
                       'blueberries', 'mushrooms', 'zucchini']
    for ingredient in ingredient_list:
        base_url = f"https://simple-veganista.com/tag/{ingredient}/"
        response = requests.get(base_url, timeout=30)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        # Each listed recipe appears as a link inside an <h2 class="entry-title">
        entry_soup = soup.find_all('h2', class_='entry-title')
        for content in entry_soup:
            recipe_urls.append(content.find_all('a')[0].attrs['href'])
        print("URLs Collected!")
        print(recipe_urls)
        print("================================")
        time.sleep(10)  # be polite: pause between requests
    # Get recipe information from each URL
    counter = 0
    for recipe_url in recipe_urls:
        response = requests.get(recipe_url, timeout=30)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        result = []
        # Name
        name_soup = soup.find_all('h2', class_='tasty-recipes-title')
        name_content = name_soup[0].get_text() if name_soup else "N/A"
        result.append(name_content)
        # Detailed ingredients: full text of the ingredients section
        ingredient_details_soup = soup.find_all('div', class_='tasty-recipes-ingredients-body')
        if ingredient_details_soup:
            ingredients_details_content = ingredient_details_soup[0].get_text().strip("\n")
        else:
            ingredients_details_content = "N/A"
        result.append(ingredients_details_content)
        # Ingredient names: the <strong> tags inside the ingredients section
        if ingredient_details_soup:
            ingredients_content = ingredient_details_soup[0].find_all('strong')
            if ingredients_content:
                ingredients = ', '.join(tag.get_text() for tag in ingredients_content)
            else:
                ingredients = "N/A"
        else:
            ingredients = "N/A"
        result.append(ingredients)
        # Instructions
        instructions_soup = soup.find_all('div', class_='tasty-recipes-instructions-body')
        instructions_content = instructions_soup[0].get_text() if instructions_soup else "N/A"
        result.append(instructions_content)
        # Yield: remove the "Yield:" label as a prefix rather than with
        # str.strip('Yield: '), which strips a character set and can also
        # eat letters from the value itself
        yield_soup = soup.find_all('li', class_='yield')
        if yield_soup:
            yield_content = yield_soup[0].get_text().replace('Yield:', '').strip()
        else:
            yield_content = "N/A"
        result.append(yield_content)
        # Total cook time
        cooktime_soup = soup.find_all('li', class_='total-time')
        cooktime_content = cooktime_soup[0].get_text() if cooktime_soup else "N/A"
        result.append(cooktime_content)
        # Assemble one record: name, ingredients, detailed ingredients,
        # instructions, yield, cook time, each field terminated by '///'
        print_result = "***Recipe***\n"
        print_result += result[0] + '///\n'
        print_result += result[2] + '///\n'
        print_result += result[1] + '///\n'
        print_result += result[3] + '///\n'
        print_result += result[4] + '///\n'
        print_result += result[5] + '///\n'
        print_result += "******\n"
        # Append the record to the database file
        with open('RecipeDatabase.txt', 'a') as fp:
            fp.write(print_result)
        counter += 1
        print(f"Recipe {counter} Collected!")
        time.sleep(10)  # pause before the next recipe page


if __name__ == "__main__":
    recipeCollector()
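

# A minimal sketch (an addition, not part of the original crawler) of how the
# '///'-delimited records written above could be read back. It assumes
# RecipeDatabase.txt sits in the working directory and that every record keeps
# the "***Recipe***" / "******" framing produced by recipeCollector(); the
# readRecipes name is hypothetical.
def readRecipes(path='RecipeDatabase.txt'):
    recipes = []
    with open(path) as fp:
        blocks = fp.read().split("***Recipe***\n")
    for block in blocks[1:]:
        body = block.split("******")[0]
        # Field order matches the writer: name, ingredients, detailed
        # ingredients, instructions, yield, cook time ("N/A" when missing)
        fields = [f.strip() for f in body.split('///') if f.strip()]
        recipes.append(fields)
    return recipes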