quote_scraper/quote_scraper.py at master · jks7743/quote_scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
A web scraper program used to scrape
http://leagueoflegends.wikia.com/wiki/List_of_champions
for champion names, quotes and images using the BeautifulSoup4 and Requests
packages. The program outputs the information to a google spreadsheet for use
with the OKRammus API

The program creates a list of dictionaries to be uploaded to an online
database. Each entry has the following format:
	champ_dict:
		str Name : 'champ_name'
		str Link : 'champ_link'
		str Image : 'image_link'
		list Quotes : ['"champ quote"',...]
		str PrettyName : 'champ name'

May 2018
"""
__author__ = "Joshua Schenk"

from urllib.parse import unquote

import html5lib
import requests
from bs4 import BeautifulSoup

def get_champs():
	"""
	Constructs a list of dictionaries representing all the champions listed in
	the league of legends wikia page
	http://leagueoflegends.wikia.com/wiki/List_of_champions

	The first row of the champion table is treated as the table header and is
	skipped. Rows without a usable link are ignored rather than aborting the
	whole scrape.

	:return:
		a list of dictionaries that are internal representations of champions
		(an empty list if the champion table is not found on the page)
			dict:
				Name : link-formatted champion name taken from the href
				Link : absolute link to the champion's page
				Image : value returned by get_champ_image
				Quotes : value returned by get_champ_quotes
				PrettyName : value returned by get_pretty_name
	"""
	source = 'http://leagueoflegends.wikia.com/wiki/List_of_champions'
	html = requests.get(source)
	champ_soup = BeautifulSoup(html.text, 'html5lib')
	champ_table = champ_soup.find('table', class_='wikitable sortable')	# find the right table
	if champ_table is None:
		# find() returns None when nothing matches; nothing to scrape
		return []
	table_body = champ_table.find('tbody')
	if table_body is None:
		# no explicit <tbody> in the parsed tree: search the table itself
		table_body = champ_table

	champ_links = []
	# the first row is the table header, not a champion, so it is skipped
	for tr in table_body.find_all('tr')[1:]:
		champ_data = tr.find('a')
		if champ_data is None:
			continue	# row has no link at all
		href = champ_data.get('href')
		if href is None:
			continue	# anchor without a target
		page_link = str(href)
		link_parts = page_link.split('/')
		if len(link_parts) < 3:
			continue	# link has too few '/'-separated parts to hold a name
		champ_links.append({
			'Name': link_parts[2],	# link formatted version of a champion's name
			'Link': 'http://leagueoflegends.wikia.com' + page_link,	# format link
		})

	for champ in champ_links:
		champ['Image'] = get_champ_image(champ)
		champ['Quotes'] = get_champ_quotes(champ)	# get champ quotes
		champ['PrettyName'] = get_pretty_name(champ)
	return champ_links

def get_champ_quotes(champ_dict):
	"""
	Finds and returns a list of all the quotes a champion has, based on the
	wikia link stored in the given champion dictionary.

	Every <i> element on the champion's '/Quotes' page is inspected; its text
	is kept only when it starts with a double quote character.

	:param champ_dict: a dictionary entry of a champion (must contain 'Link')
	:return: a list of champion quotes (strings starting with '"')
	"""
	champ_link = champ_dict['Link']	# get a champ's link
	html = requests.get(champ_link + '/Quotes')	# go to Quote page for easy parsing
	soup = BeautifulSoup(html.text, 'html5lib')
	champ_quotes = []
	for italic in soup.find_all('i'):
		text = italic.string
		if text is None:
			continue	# tag has no single text child, so it cannot be a quote line
		champ_quote = str(text)	# make the quote a plain string
		# compare by value (not identity) and stay safe on empty strings
		if champ_quote.startswith('"'):	# filter out non-quote or noise lines
			champ_quotes.append(champ_quote)
	return champ_quotes

def get_champ_image(champ_dict):
	"""
	Finds a link to the image of a champion based on a given dictionary.

	The champion's 'Name' is printed, then the wikia file page
	'File:<Name>_OriginalSkin.jpg' is fetched and the href of the first link
	inside the 'fullImageLink' div is returned.

	:param champ_dict: a dictionary entry of a champion (must contain 'Name')
	:return: link to the image of a champion, or None when the file page has
		no 'fullImageLink' div or that div contains no link
	"""
	print(champ_dict['Name'])
	champ_image_link = 'http://leagueoflegends.wikia.com/wiki' + '/File:' + champ_dict['Name'] + '_OriginalSkin.jpg'
	html = requests.get(champ_image_link)
	soup = BeautifulSoup(html.text, 'html5lib')
	image_parent = soup.find('div', class_='fullImageLink')
	if image_parent is None:
		# find() returns None when nothing matches; report "no image"
		# instead of failing with AttributeError
		return None
	image_link = image_parent.find('a')
	if image_link is None:
		return None
	return image_link.get('href')

def get_pretty_name(champ_dict):
	"""
	Small helper function that turns the link-formatted name of a champion
	into a human readable one: percent-escapes (such as %27 for ' or %26
	for &) are decoded and underscores are replaced with spaces.

	:param champ_dict: a dictionary entry of a champion (must contain 'Name')
	:return: a pretty version of a champion's name
	"""
	champ_name = str(champ_dict['Name'])
	# decode every percent-escape, not only %27, so that names containing
	# other escaped characters come out readable too
	return unquote(champ_name).replace('_', ' ')