From 3906d4f6814073cdc5c83b88635607dcc99ff3d5 Mon Sep 17 00:00:00 2001
From: "Dr. Masroor Ehsan"
Date: Wed, 12 Dec 2018 11:23:23 +0600
Subject: [PATCH] app review scraping

---
 README.md                | 25 ++++++++++++++
 play_scraper/__init__.py |  1 +
 play_scraper/api.py      | 12 +++++++
 play_scraper/scraper.py  | 71 ++++++++++++++++++++++++++++++++++++++++
 play_scraper/settings.py |  6 ++--
 requirements.txt         |  1 +
 tests/test_scraper.py    | 16 +++++++++
 7 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index aaa400b..5fdff9b 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ pip install play-scraper
 * [search](#search): Fetch applications matching a search query.
 * [similar](#similar): Fetch an application's similar apps.
 * [categories](#categories): Fetch a list of available categories.
+* [reviews](#reviews): Fetch a list of an application's reviews.
 
 #### details
 
@@ -273,6 +274,30 @@ Options:
   'url': 'https://play.google.com/store/apps/category/ART_AND_DESIGN'},
  ...}
 ```
+#### reviews
+
+Fetch a list of an application's reviews.
+
+Options:
+
+* `app_id` the app id to get, e.g. `com.android.chrome` for Google Chrome.
+* `page` (default 1) the page number to fetch.
+
+```python
+>>> import play_scraper
+>>> play_scraper.reviews('com.android.chrome', 1)
+[{
+    'author_image': 'https://lh3.googleusercontent.com/a-/...',
+    'review_id': 'gp:AOqpTOHu4lr...',
+    'review_permalink': '/store/apps/details?id=com.android.chrome&reviewId=...',
+    'author_name': ' Martin Staf ',
+    'review_date': 'December 5, 2018',
+    'current_rating': 5,
+    'review_title': '',
+    'review_body': " The overflow menu at the bottom ..."
+},...]
+```
+
 ### Tests
 
 Run test:
diff --git a/play_scraper/__init__.py b/play_scraper/__init__.py
index 2566244..c300741 100644
--- a/play_scraper/__init__.py
+++ b/play_scraper/__init__.py
@@ -18,6 +18,7 @@
     similar,
     suggestions,
     categories,
+    reviews,
 )
 
 
diff --git a/play_scraper/api.py b/play_scraper/api.py
index 9f58c33..3b8963b 100644
--- a/play_scraper/api.py
+++ b/play_scraper/api.py
@@ -99,3 +99,15 @@ def categories(hl='en', gl='us', ignore_promotions=True):
     """
     s = scraper.PlayScraper(hl, gl)
     return s.categories(ignore_promotions)
+
+
+def reviews(app_id, page=1, hl='en', gl='us'):
+    """Sends a POST request and retrieves a list of reviews for
+    the specified app.
+
+    :param app_id: the app to retrieve reviews for, e.g. 'com.nintendo.zaaa'
+    :param page: the page number to retrieve; max is 10
+    :return: a list of reviews
+    """
+    s = scraper.PlayScraper(hl, gl)
+    return s.reviews(app_id, page)
diff --git a/play_scraper/scraper.py b/play_scraper/scraper.py
index 7086ec4..edd882d 100644
--- a/play_scraper/scraper.py
+++ b/play_scraper/scraper.py
@@ -13,6 +13,7 @@
 
 import requests
 from bs4 import BeautifulSoup, SoupStrainer
+import cssutils
 
 from play_scraper import settings as s
 from play_scraper.constants import HL_LANGUAGE_CODES, GL_COUNTRY_CODES
@@ -283,3 +284,73 @@ def categories(self, ignore_promotions=True):
                     'category_id': category_id}
 
         return categories
+
+    def reviews(self, app_id, page=1):
+        """Sends a POST request and retrieves a list of reviews for
+        the specified app.
+
+        :param app_id: the app to retrieve reviews for, e.g. 'com.nintendo.zaaa'
+        :param page: the page number to retrieve; max is 10
+        :return: a list of reviews
+        :raises ValueError: if the response contains no review payload
+        """
+        data = {
+            'reviewType': 0,
+            'pageNum': page,
+            'id': app_id,
+            'reviewSortOrder': 4,
+            'xhr': 1,
+            'hl': self.language
+        }
+        # Build the query params on a copy so this call does not permanently
+        # add 'authuser' to the params shared by the instance.
+        params = dict(self.params, authuser='0')
+
+        response = send_request('POST', s.REVIEW_URL, data, params)
+        content = response.text
+        # Skip anything that precedes the JSON array in the response body.
+        start = content.find('[["ecr"')
+        if start == -1:
+            raise ValueError('No review data in response for %s' % app_id)
+        payload = json.loads(content[start:].strip())
+        html = payload[0][2]
+        # html is already decoded text, so no from_encoding hint is passed
+        # (bs4 ignores it for unicode markup and emits a warning).
+        soup = BeautifulSoup(html, 'lxml')
+
+        reviews = []
+        for element in soup.select('.single-review'):
+            review = {}
+
+            # The avatar URL is only available inside an inline CSS style.
+            avatar = element.select_one('.author-image')
+            avatar_style = avatar.get('style') if avatar else None
+            if avatar_style:
+                sheet = cssutils.css.CSSStyleSheet()
+                sheet.add('tmp { %s }' % avatar_style)
+                urls = list(cssutils.getUrls(sheet))
+                if urls:
+                    review['author_image'] = urls[0]
+
+            review_header = element.select_one('.review-header')
+            review['review_id'] = review_header.get('data-reviewid', '')
+            review['review_permalink'] = review_header.select_one(
+                '.reviews-permalink').get('href')
+
+            review['author_name'] = review_header.select_one('.author-name').text
+            review['review_date'] = review_header.select_one('.review-date').text
+
+            # The rating is encoded as a CSS width; width / 20 maps 100% to 5.
+            curr_rating = review_header.select_one('.current-rating').get('style')
+            width = str(cssutils.parseStyle(curr_rating).width)
+            review['current_rating'] = int(width.replace('%', '')) // 20
+
+            body_elem = element.select_one('.review-body')
+            review_title = body_elem.select_one('.review-title').extract()
+            body_elem.select_one('.review-link').decompose()
+            review['review_title'] = review_title.text
+            review['review_body'] = body_elem.text
+
+            reviews.append(review)
+
+        return reviews
diff --git a/play_scraper/settings.py b/play_scraper/settings.py
index 3f7fb15..820220a 100644
--- a/play_scraper/settings.py
+++ b/play_scraper/settings.py
@@ -1,8 +1,10 @@
 # -*- coding: utf-8 -*-
 
-BASE_URL = 'https://play.google.com/store/apps'
+PLAYSTORE_URL = 'https://play.google.com/store'
+BASE_URL = PLAYSTORE_URL + '/apps'
 SUGGESTION_URL = 'https://market.android.com/suggest/SuggRequest'
-SEARCH_URL = 'https://play.google.com/store/search'
+SEARCH_URL = PLAYSTORE_URL + '/search'
+REVIEW_URL = PLAYSTORE_URL + '/getreviews'
 
 CONCURRENT_REQUESTS = 10
 USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
diff --git a/requirements.txt b/requirements.txt
index f7497e8..ce9b784 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ certifi==2018.4.16
 cffi==1.11.5
 chardet==3.0.4
 cryptography==2.3
+cssutils==1.0.2
 enum34==1.1.6
 futures==3.2.0;python_version<"2.7"
 idna==2.7
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index f4057a9..7e444bf 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -73,6 +73,14 @@
     'updated',
 }
 
+REVIEW_KEYS = {
+    'author_name',
+    'review_id',
+    'review_permalink',
+    'review_date',
+    'review_title',
+    'review_body'
+}
 
 class ScraperTestBase(unittest.TestCase):
     def setUp(self):
@@ -380,3 +388,11 @@ def test_different_language_and_country(self):
         categories = s.categories()
 
         self.assertTrue(all(key in categories for key in CATEGORIES))
+
+
+class ReviewTest(ScraperTestBase):
+    def test_review_ok(self):
+        reviews = self.s.reviews('com.android.chrome')
+
+        self.assertGreater(len(reviews), 0)
+        self.assertTrue(all(key in reviews[0] for key in REVIEW_KEYS))