#####################################################
#                   WEB SCRAPING                    #
#           POLISH CONSTITUTIONAL TRIBUNAL          #
#                                                   #
#         MATEUSZ BUCZYNSKI & OSKAR RYCHLICA        #
#####################################################
#####################################################
# REQUIREMENTS:
#####################################################
# * Please download wkhtmltopdf.exe, install it and specify
#   the path to it in the variable pathwkthmltopdf.
# * Please install the newest Selenium package.
# * Please download geckodriver and specify the path to it
#   in the executable_path argument of webdriver.Firefox(...).
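# A typical environment setup might look like this (assuming pip
# is available; package names taken from the imports below):
#   pip install selenium pdfkit progress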
#####################################################
# The code below scrapes Tribunal jurisdictions together
# with their separate opinions.
# Specify the filters and all necessary parameters in the
# PARAMETRIZATION section!
# Each step is described more thoroughly throughout the code.
#####################################################
# OUTPUT:
#####################################################
# The output is as follows:
# - outputDict - main dictionary keyed by signature, where
#   each value is a list of dictionaries (JSON-like)
#   containing the fields:
#   ** id - id of a jurisdiction
#   ** link - direct link to the jurisdiction
#   ** sep_opi - list of separate opinions (if available)
#      in the form of dictionaries with the fields:
#      *** link - direct link to the separate opinion
#      *** by - names and surnames of the separate
#          opinion's authors
# - mostcommon5 - list of tuples with the 5 most active
#   authors of separate opinions, in the form:
#   (name, number of separate opinions)
# - file output saved in folders in the following way:
#   ** /ID_SIGNATURE - all PDF and HTML files relating to a
#      single jurisdiction are stored here, each file named
#      ID_SIGNATURE.PDF (.HTML)
#   ** /ID_SIGNATURE/separate_opinions - all PDF and HTML
#      files relating to each separate opinion of a single
#      jurisdiction are stored here, named
#      ID_SIGNATURE_BY.PDF (.HTML)
# The program also produces a log file containing DEBUG info.
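# Example of a single outputDict entry (illustrative values only):
#   outputDict['K 1/20'] = [{
#       'id': 17,
#       'link': 'http://ipo.trybunal.gov.pl/ipo/...',
#       'sep_opi': [{'link': 'http://ipo.trybunal.gov.pl/ipo/...',
#                    'by': ['Jan Kowalski']}]
#   }]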
#####################################################
# IMPORTS
#####################################################
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import time
import pdfkit
import json
import collections
import os
import sys
import timeit
import datetime
import glob
from progress.bar import IncrementalBar
from datetime import timedelta
from functions import *
pLoggerInit()  # initializing the logger (defined in functions.py)
#####################################################
# CONSTANTS
#####################################################
outputDict = {}     # main output dict, keyed by jurisdiction signature
pageCount = 0       # number of result pages crawled (jurisdictions)
objectCount = 0     # number of jurisdiction objects scraped
sepPageCount = 0    # number of result pages crawled (separate opinions)
sepObjectCount = 0  # number of separate opinion objects scraped
#####################################################
# PARAMETRIZATION
#####################################################
# here you should define the starting page of the Tribunal search and the separate opinions webpage
jurisdictionURL = 'http://ipo.trybunal.gov.pl/ipo/Szukaj'
sepOpinionURL = 'http://ipo.trybunal.gov.pl/ipo/SzukajZO'
mainOutputDirectory = "output/"
# True - also save PDF versions of pages, False (default) - no PDF output
isPDFOutput = False
try:
    options = Options()
    options.set_headless(headless=True)
    # defining the main driver object, using the Firefox browser by default
    driver = webdriver.Firefox(firefox_options=options,
                               executable_path=r'C:\Gecko\geckodriver.exe')
    if isPDFOutput:
        # defining the wkhtmltopdf configuration
        pathwkthmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
        PDFconfig = pdfkit.configuration(wkhtmltopdf=pathwkthmltopdf)
except Exception:
    pLogger('log.txt', True, '[ERROR] Drivers not located!')
    raise
wait = WebDriverWait(driver, 30) # defining wait object with timeout of 30 s
# logging that the drivers were found
pLogger('log.txt', True, '[DEBUG] Drivers located!')
# Diagnosis stage
# To pick a particular "Diagnosis stage" specify the argument below:
# 0 - "Proper diagnosis stage"/"Rozpoznanie właściwe"
# 1 - "Signalization"/"Sygnalizacja"
# 2 - "Preliminary control"/"Wstępna kontrola"
diagnosisStageSel = 0
diagnosisStageSelDict = {0: 'Proper diagnosis stage',
                         1: 'Signalization',
                         2: 'Preliminary control'}
# Time range
# To pick a particular "Time range" specify the argument below:
# 0 - "Last year"/"Ostatni rok"
# 1 - "Last 5 years"/"Ostatnie 5 lat"
# 2 - "Last 10 years"/"Ostatnie 10 lat"
# 3 - "Since 16.10.1997"/"Od 16.10.1997 roku"
# 4 - "Since 1986"/"Od 1986 roku"
timeRangeSel = 3
timeRangeSelDict = {0: 'Last year',
                    1: 'Last 5 years',
                    2: 'Last 10 years',
                    3: 'Since 16.10.1997',
                    4: 'Since 1986'}
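# Example: to scrape only "Signalization" cases from the last 5 years,
# set diagnosisStageSel = 1 and timeRangeSel = 1.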
#####################################################
# PAGE LOAD & OBJECT FILTERING
#####################################################
pLogger('log.txt', True, '[DEBUG] Opening webpage: ', jurisdictionURL)
driver.get(jurisdictionURL) # opening main page
pLogger('log.txt', True, '[DEBUG] Page loaded! ', jurisdictionURL)
# waiting for the filters to load - clicking a filter only if it is not selected already
diagnosisStage = wait.until(EC.visibility_of_element_located(
    (By.ID,
     'filtr:facetList_rodzajRozstrzygniecia:' + str(diagnosisStageSel) + ':facetCommand')))
if len(diagnosisStage.find_elements_by_tag_name("img")) == 0:
    diagnosisStage.click()
timeRange = wait.until(EC.visibility_of_element_located(
    (By.ID,
     'filtr:facetList_wyszukiwanieOkres:' + str(timeRangeSel) + ':facetCommand')))
if len(timeRange.find_elements_by_tag_name("img")) == 0:
    timeRange.click()
pLogger('log.txt', True, '[DEBUG] Filtering done using options: Diagnosis stage: {0}, Time Range: {1}!'.format(
    diagnosisStageSelDict[diagnosisStageSel], timeRangeSelDict[timeRangeSel]))
#####################################################
# MAIN FILES SCRAPING
#####################################################
# This chunk crawls over all listed jurisdictions to find
# links to their separate pages and stores them in a dict.
# It fills outputDict, mapping each signature to a list of
# dictionaries with the keys:
#
# - id - id of the scraped object
# - link - hyperlink to the particular jurisdiction
# - sep_opi - empty list (for now) for future separate opinions
# Getting the maximum number of results per page (in our case it should be 500)
allViewOptions = wait.until(EC.visibility_of_element_located(
    (By.XPATH,
     "//select[@name='wyszukiwanie:dataTable:rows']"))).find_elements_by_tag_name("option")
allViewOptions[-1].click()  # assuming the options are ordered, we choose the last one as it displays the most elements at once
maxPagesPerView = allViewOptions[-1].get_attribute("value")
pLogger('log.txt', True, '[DEBUG] Achieved maximum objects per page: ', maxPagesPerView)
# Main hyperlink scraping procedure. It crawls over each link returned by the
# dataTable on the Tribunal page and keeps each signature and link in the dictionary.
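# For example, the link element for the 4th object (objectCount == 3) has the id:
#   wyszukiwanie:dataTable:3:dokument_:sprawa:sprawaLink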
while True:
    # hardcoded 0.5 s wait so we are sure that the modal showed up (and has been hidden again)
    time.sleep(0.5)
    wait.until_not(EC.visibility_of_element_located((By.ID, "j_idt28_modal")))
    objectCount = pageCount * int(maxPagesPerView)  # index of the first object on the current page
    while True:
        # each jurisdiction's element id is built from constant parts and the iterator
        name = 'wyszukiwanie:dataTable:' + str(objectCount) + ':dokument_:sprawa:sprawaLink'
        if len(driver.find_elements_by_xpath('//a[@id="' + name + '"]')) == 0:
            break  # no more matching elements on this page
        item = driver.find_element_by_xpath('//a[@id="' + name + '"]')
        sign_ = item.text  # the item's text is the signature
        link_ = item.get_attribute('href')  # the item's href is the link to a separate page
        if sign_ not in outputDict:
            outputDict[sign_] = [{'id': objectCount, 'link': link_, 'sep_opi': []}]
        else:
            outputDict[sign_].append({'id': objectCount, 'link': link_})
        objectCount += 1
    nextPage = driver.find_element_by_class_name("ui-paginator-next")  # pagination button
    nextPageClasses = nextPage.get_attribute('class')
    pLogger('log.txt', True, "[DEBUG] Looping over page num:", str(pageCount + 1), ', scraped ', str(objectCount), ' objects')
    if 'ui-state-disabled' in nextPageClasses:  # a disabled button means we are on the last page and can break
        break
    pageCount += 1
    nextPage.click()
# In some cases there is more than one jurisdiction with the same signature - we count only unique values
pLogger('log.txt', True, '[DEBUG] FINALIZED STEP 1/4, scraped {0} jurisdiction(s) in {1} object(s)'.format(len(outputDict), objectCount))
#####################################################
# SEPARATE OPINIONS FILES SCRAPING
#####################################################
# This chunk iterates over all jurisdiction objects, searches for each signature
# on the separate opinions search page, and then saves all the separate opinions
# found. If there are no separate opinions, the sep_opi field of the
# jurisdiction's dictionary stays an empty list.
# A separate opinion's dictionary looks as follows:
# - link - link to the separate opinion's website
# - by - names and surnames of the separate opinion's authors
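# Example entry appended to sep_opi (illustrative values only):
#   {'link': 'http://ipo.trybunal.gov.pl/ipo/...', 'by': ['Jan Kowalski', 'Anna Nowak']}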
# Connecting to the separate opinions webpage
pLogger('log.txt', True, '[DEBUG] Opening webpage: ', sepOpinionURL)
driver.get(sepOpinionURL)
pLogger('log.txt', True, '[DEBUG] Page loaded! ', sepOpinionURL)
while True:
    # hardcoded 0.5 s wait so we are sure that the modal showed up (and has been hidden again)
    time.sleep(0.5)
    wait.until_not(EC.visibility_of_element_located((By.ID, "j_idt28_modal")))
    # getting the table with all links to separate opinions
    table = driver.find_element_by_class_name('ui-datatable-tablewrapper')
    links = table.find_elements_by_tag_name('a')
    # to find the authors we need a long XPath - many children down from the closest tag;
    # the authors sit in the 5th row of the separate opinion's object
    separateOpinionsAuthors = driver.find_elements_by_xpath("//table[@class='datalist2-noborder2']/tbody/tr[position()=5]/td/div/div/dl")
    for n, link in enumerate(links):
        if link.text in outputDict:
            link_ = link.get_attribute('href')  # for each link we get a direct hyperlink
            # we get all the authors of a single separate opinion
            allAuthors = [x.text for x in separateOpinionsAuthors[n].find_elements_by_tag_name('dt')]
            outputDict[link.text][0]['sep_opi'].append({'link': link_, 'by': allAuthors})
            sepObjectCount += 1
    nextPage = driver.find_element_by_class_name("ui-paginator-next")  # pagination button
    nextPageClasses = nextPage.get_attribute('class')
    pLogger('log.txt', True, "[DEBUG] Looping over page num:", str(sepPageCount + 1), ', scraped ', str(sepObjectCount), ' separate opinion(s)')
    if 'ui-state-disabled' in nextPageClasses:  # a disabled button means we are on the last page and can break
        break
    sepPageCount += 1
    nextPage.click()
pLogger('log.txt', True, '[DEBUG] FINALIZED STEP 2/4, searched for all separate opinions')
# Note: the driver is still needed for the file download step below,
# so we do not close it here.
# driver.close()
#####################################################
# JUDGES ACTIVITY
#####################################################
# This chunk creates a Counter object that counts occurrences of the authors'
# names in the list of all separate opinion authors. Then we simply pick the
# 5 judges from the top of the list.
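# A minimal sketch of the counting logic on toy data:
#   collections.Counter(['A', 'B', 'A']).most_common(2)
#   -> [('A', 2), ('B', 1)]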
fullSurnames = [x for key in outputDict for item in outputDict[key][0]['sep_opi'] for x in item['by']]
counterSurnames = collections.Counter(fullSurnames)
mostcommon5 = counterSurnames.most_common(5)
print('\n')
pLogger('log.txt', False, '5 most common judges by separate opinions:\n')
for judge in mostcommon5:
    pLogger('log.txt', False, "{: >40} {: >5} separate opinion(s)".format(*judge))
print('\n')
pLogger('log.txt', True, '[DEBUG] FINALIZED STEP 3/4, found 5 most active judges')
#####################################################
# FILE OUTPUT
#####################################################
# While performing the file output we loop through outputDict.
# Previously we put the appropriate case links in sepOpi['link'].
# Using those links we download both a .pdf and a .html file for each
# case and separate opinion. The separate opinions go into a separate
# directory. driverdownload is a helper function (from functions.py) that
# gives us a one-liner similar to the pdf download, for better code readability.
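# For reference, a minimal sketch of what driverdownload might look like
# (the actual implementation lives in functions.py; this is an assumption):
#   def driverdownload(driver, path, url):
#       driver.get(url)  # load the target page in the browser
#       with open(path, 'w', encoding='utf-8') as f:
#           f.write(driver.page_source)  # dump the rendered HTML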
pLogger('log.txt', True, '[DEBUG] Starting download of the files')
# startedDownloadTime = timeit.default_timer()
# linksWithErrors=[]
objToDownload = sepObjectCount + objectCount
objDownloaded = 0
bar = IncrementalBar('Downloading files', max=objToDownload,
                     suffix='%(index)4d/%(max)4d | %(eta_td)s remaining')
for key in outputDict:
    obj = outputDict[key][0]
    # every 25 downloaded cases we restart the driver to free resources
    if objDownloaded % 25 == 0 and objDownloaded > 0:
        # we are quitting the driver to start a new session
        driver.quit()
        driver = webdriver.Firefox(firefox_options=options,
                                   executable_path=r'C:\Gecko\geckodriver.exe')
    # first we create the output directory
    outputDirectory = mainOutputDirectory + str(key).replace("/", "_").replace(" ", "_")
    if not os.path.exists(outputDirectory):
        os.makedirs(outputDirectory)
    separate_opinions = obj['sep_opi']
    # we attempt to download the separate opinions only if they exist
    if separate_opinions:
        tempDirectory = outputDirectory + "/" + "separate_opinions"
        if not os.path.exists(tempDirectory):
            os.makedirs(tempDirectory)
        # loop through all separate opinions to download them
        for n, sepOpi in enumerate(separate_opinions):
            filename = (str(n) + str(obj['id']) + "_" + key + "_" + ''.join(sepOpi['by']).replace(' ', '_')).replace("/", "_").replace(" ", "_")
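            # e.g. for n == 0, id == 17, key == 'K 1/20' and by == ['Jan Kowalski'],
            # the filename becomes 017_K_1_20_Jan_Kowalski (illustrative values)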
            try:
                driverdownload(driver, tempDirectory + "/" + filename + ".html", sepOpi['link'])
            except Exception:
                pLogger('log.txt', False, "[ERROR] File {0} could not be saved. Please download it manually via: {1}".format(filename, sepOpi['link']))
                continue
            if isPDFOutput:
                sys.stdout = open(os.devnull, 'w')  # silence wkhtmltopdf output
                # the link is a URL, so pdfkit.from_url is the appropriate call
                pdfkit.from_url(sepOpi['link'], tempDirectory + "/" + filename + ".pdf", configuration=PDFconfig)
                sys.stdout = sys.__stdout__
            objDownloaded += 1
            bar.next()
    for case in outputDict[key]:
        # finally download the case itself
        filename = (str(case['id']) + str(key)).replace("/", "_").replace(" ", "_")
        try:
            driverdownload(driver, outputDirectory + "/" + filename + ".html", case['link'])
        except Exception:
            pLogger('log.txt', False, "[ERROR] File {0} could not be saved. Please download it manually via: {1}".format(filename, case['link']))
            continue
        if isPDFOutput:
            sys.stdout = open(os.devnull, 'w')  # silence wkhtmltopdf output
            pdfkit.from_url(case['link'], outputDirectory + "/" + filename + ".pdf", configuration=PDFconfig)
            sys.stdout = sys.__stdout__
        objDownloaded += 1
        bar.next()
bar.finish()
pLogger('log.txt', True, '[DEBUG] FINALIZED STEP 4/4, saved all files')
#####################################################
# FILE CHECK
#####################################################
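# check() comes from functions.py; a minimal sketch of what such a validity
# test might do (an assumption, not the actual implementation):
#   def check(path):
#       with open(path, encoding='utf-8') as f:
#           html = f.read()
#       return '</html>' not in html  # True means the file looks broken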
files = glob.glob("output/*/*.html") + glob.glob("output/*/*/*.html")
fileCheck = 0
fileBad = 0
for file in files:
    # check() (from functions.py) returns True when a file was not downloaded correctly
    if check(file):
        pLogger('log.txt', True, '[DEBUG] {} is not downloaded correctly.'.format(file))
        fileBad += 1
    else:
        fileCheck += 1
pLogger('log.txt', True, '[DEBUG] {} file(s) downloaded incorrectly, {} file(s) downloaded correctly.'.format(fileBad, fileCheck))