This repository was archived by the owner on Nov 27, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathgoogle_search.py
More file actions
executable file
·92 lines (72 loc) · 2.82 KB
/
google_search.py
File metadata and controls
executable file
·92 lines (72 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
# encoding: utf-8
from __future__ import unicode_literals
import logging
import urllib
import urlparse
import json
from collections import OrderedDict
import requests
LOG = logging.getLogger('sw.google_search')
def _decode_response(json_string):
    """Parse a Custom Search API JSON reply into a list of result items.

    Every returned item dict gains a ``meta`` key holding all of the
    response's top-level fields other than ``items``.  An empty list is
    returned when the response reports zero results.
    """
    response = json.loads(json_string)

    # Everything except the result items themselves is kept as metadata.
    meta = {}
    for key, value in response.items():
        if key != 'items':
            meta[key] = value

    num_results = int(meta['searchInformation']['totalResults'])
    if not num_results:
        LOG.info("No search results.")
        LOG.info(json.dumps(response, indent=4))
        return []

    LOG.info("{} results.".format(num_results))
    items = response['items']
    for item in items:
        item['meta'] = meta
    return items
def _strip_protocol(url):
"""
>>> _strip_protocol('http://foo.bar/blah.x?baz=10&bob=15;x')
u'foo.bar/blah.x?baz=10&bob=15;x'
"""
p = urlparse.urlparse(url)
new_url = urlparse.urlunparse(
('', p.netloc, p.path, p.params, p.query, p.fragment))
return new_url.lstrip('/')
class GoogleCustomSearch(object):
    """Minimal client for the Google Custom Search JSON API."""

    def __init__(self, search_engine_id, api_key):
        """Store the CSE engine id (``cx``) and the per-request API key."""
        self.search_engine_id = search_engine_id
        self.api_key = api_key

    def search(self, keyword, site=None, max_results=100):
        """Yield search result dicts for ``keyword``, page by page.

        :param keyword: query string.
        :param site: optional URL; results are restricted to its domain.
        :param max_results: stop after roughly this many results.
        :raises AssertionError: if ``keyword`` is not a string.
        :raises requests.exceptions.HTTPError: on a non-2xx API response.
        """
        assert isinstance(keyword, basestring)

        for start_index in range(1, max_results, 10):  # 10 is max page size
            url = self._make_url(start_index, keyword, site)
            LOG.info(url)  # fix: was logging.info() — use the module logger

            response = requests.get(url)
            if response.status_code == 403:
                # 403 bodies carry useful quota/permission details.
                LOG.info(response.content)
            response.raise_for_status()

            results = _decode_response(response.content)
            if not results:
                # Fix: an empty page previously fell through to the
                # 'nextPage' check with `search_result` unbound (NameError).
                return

            for search_result in results:
                yield search_result

            # The API omits 'nextPage' from 'queries' on the final page.
            if 'nextPage' not in search_result['meta']['queries']:
                LOG.info("No more pages...")  # fix: was print() in a library
                return

    def _make_url(self, start_index, keyword, restrict_to_site):
        """Build the customsearch API URL for one page of results.

        ``start_index`` is the 1-based index of the first result on the
        requested page.
        """
        if restrict_to_site is not None:
            keyword = 'site:{} {}'.format(_strip_protocol(restrict_to_site),
                                          keyword)

        # https://developers.google.com
        # /custom-search/json-api/v1/reference/cse/list
        params = OrderedDict([
            ('cx', self.search_engine_id),
            ('key', self.api_key),
            ('rsz', '10'),
            ('num', '10'),
            # Fix: start_index was accepted but never sent, so every
            # "page" requested the same first 10 results.
            ('start', str(start_index)),
            ('googlehost', 'www.google.com'),
            ('gss', '.com'),
            ('q', keyword),
            ('oq', keyword),
            ('filter', '0'),  # duplicate content filter, 1 | 0
            ('safe', 'off'),  # strict | moderate | off
        ])
        #if restrict_to_site is not None:
        #    params['siteSearch'] = _strip_protocol(restrict_to_site)
        return 'https://www.googleapis.com/customsearch/v1?{}'.format(
            urllib.urlencode(params))