-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrae-scraper.py
More file actions
103 lines (80 loc) · 2.77 KB
/
rae-scraper.py
File metadata and controls
103 lines (80 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
#Created by PepeBigotes
import os
def try_input(msg) -> str:
    """Prompt the user with *msg* and return the line they typed.

    If the prompt is aborted with CTRL+C (``KeyboardInterrupt``) or stdin is
    closed (``EOFError``, e.g. CTRL+D), a short notice is printed and the
    program exits instead of showing a traceback.

    Raises:
        SystemExit: when the prompt is aborted.
    """
    try:
        return input(msg)
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt")
    except EOFError:
        print("\nEOFError")
    # ``exit()`` is injected by the ``site`` module and can be missing
    # (``python -S``, frozen builds); raising SystemExit always works and
    # keeps the same exit status as a bare ``exit()``.
    raise SystemExit
# Make sure selenium is importable, offering to install it when it is missing.
import importlib
import subprocess
import sys

try:
    try:
        from selenium import webdriver
    except ImportError:
        print("[!] Module 'selenium' is not installed")
        try_input(" Press ENTER to install it (or CTRL+C to exit)")
        # Run pip through the interpreter executing this script so the
        # package lands in the same environment; a bare `pip3` on PATH may
        # belong to a different Python installation.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "selenium"],
            check=False,  # success is verified by the re-import below
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        try:
            from selenium import webdriver
        except ImportError:
            print("\n[!] Selenium couldn't be installed")
            sys.exit(1)
        try_input("\n Selenium installed, press ENTER to continue")
except KeyboardInterrupt:
    # CTRL+C while importing or installing: leave quietly.
    print("\nKeyboardInterrupt")
    sys.exit()
import sys
import requests
from time import sleep
from bs4 import BeautifulSoup
# CONSTS / CONFIG
RAE_DOMAIN = "dle.rae.es"  # host used to build the lookup URL
BSOUP_PARSER = "html.parser"  # parser name handed to BeautifulSoup
# Characters accepted in the searched word: lowercase letters, accented
# vowels, "ñ", the hyphen and "ü" (needed for words such as "pingüino").
VALID_CHARS = "qweértyuúüiíoópaásdfghjklñzxcvbnm-"
# VERIFY INPUT
# The word to look up is the first command-line argument. A missing or empty
# argument gets the usage message; an empty word would otherwise turn the
# URL built below into the bare site root.
INPUT = sys.argv[1] if len(sys.argv) > 1 else ""
if not INPUT:
    print("[!] You need to put the word you wanna look for:")
    print(" python3 rae-scraper.py <YOUR WORD HERE>")
    sys.exit(1)

# Reject anything outside VALID_CHARS, reporting the first offending character.
for char in INPUT:
    if char not in VALID_CHARS:
        print(f"[!] Invalid input: {INPUT} ({char})")
        sys.exit(1)

URL = "https://" + RAE_DOMAIN + "/" + INPUT
# CHECK CONNECTION
# Probe the URL with a plain HTTP request before a browser is started.
try:
    requests.get(URL, timeout=5)
except (requests.ConnectionError, requests.Timeout):
    # With a timeout set, a slow server raises requests.Timeout (ReadTimeout),
    # which is not a ConnectionError subclass, so both must be caught to
    # avoid an unhandled traceback.
    print(f"[!] Cannot reach {URL}\n Check your internet connection and try again")
    sys.exit(1)
# GET CONTENT
# Load the page in Firefox through selenium and keep the resulting HTML.
print(f"[*] Getting the contents of {URL} ...",end='\r')
driver = webdriver.Firefox()
try:
    driver.get(URL)
    sleep(1)  # NOTE(review): presumably lets the page finish rendering - confirm
    CONTENT = driver.page_source
    print(f"[+] Got the contents of {URL} ")
finally:
    # Always close the browser, even when loading the page fails, so no
    # Firefox process is left running.
    driver.quit()
sleep(.5)
# SCRAPE CONTENT
soup = BeautifulSoup(CONTENT, BSOUP_PARSER)
results = soup.find('div', attrs={'id': 'resultados'})
if results is None:
    # find() returns None when nothing matches; stop with a message instead
    # of failing on the attribute access below.
    print(f"[!] Could not find the results section in {URL}")
    sys.exit(1)

# Word not found: show the site's notice plus any suggested entries, then stop.
if "no está en el Diccionario" in results.get_text():
    notfound = f"La palabra {INPUT} no está en el Diccionario."
    related = "Las entradas que se muestran a continuación podrían estar relacionadas:"
    itemlist = results.find_all('div', attrs={'class': 'n1'})
    os.system('cls' if os.name == 'nt' else 'clear')
    print('\n' + notfound)
    if itemlist:
        print('\n' + related)
        for i in itemlist:
            print(" " + i.get_text())
    sys.exit()

definitions = results.find_all(attrs={'class': 'j'})
synant = results.find(attrs={'class': 'div-sin-ant'})
# The first direct <ul> child is treated as the synonyms and the second as the
# antonyms. When the container is absent (find() gave None) or has fewer
# lists, the missing ones fall back to an empty list.
synant_lists = synant.find_all("ul", recursive=False) if synant is not None else []
synonyms = synant_lists[0] if len(synant_lists) > 0 else []
antonyms = synant_lists[1] if len(synant_lists) > 1 else []
# PRINTS
# Clear the terminal, show the searched word, then list each section in turn.
clear_command = 'cls' if os.name == 'nt' else 'clear'
os.system(clear_command)
print(f"\n{INPUT}\n")
for heading, entries in (
    ("DEFINITIONS:", definitions),
    ("SYNONYMS:", synonyms),
    ("ANTONYMS:", antonyms),
):
    print(heading)
    for entry in entries:
        print(" " + entry.get_text())