-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmonster.py
More file actions
32 lines (28 loc) · 872 Bytes
/
monster.py
File metadata and controls
32 lines (28 loc) · 872 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
import os
import requests
from bs4 import BeautifulSoup
import unicodedata
def main():
    """Read a URL from stdin, scrape its description, and append it to a file.

    The URL is read with no prompt, downloaded with ``open_url``, reduced to
    plain text with ``get_text``, and appended (followed by a newline) to
    ``monster_texts.txt`` in the current working directory.
    """
    url = input()
    text = get_text(open_url(url))
    # Scraped pages routinely contain characters that a locale-default codec
    # (e.g. cp1252 on Windows) cannot encode, so pin the file encoding
    # instead of relying on the platform default.
    with open('monster_texts.txt', 'a', encoding='utf-8') as out_file:
        out_file.write(text + '\n')
def open_url(url):
    """Download *url* and return the response body as text.

    The request is sent with a desktop Firefox ``user-agent`` header and a
    ``timeout`` of 20 seconds.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded to ``str`` (``response.text``).

    Raises:
        requests.RequestException: Propagated unchanged from ``requests``
            when the request fails (connection error, timeout, bad URL).
    """
    headers = {
        'user-agent':
            'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:38.0)'
            ' Gecko/20100101 Firefox/38.0',
    }
    # Use the session as a context manager so its pooled connections are
    # released once the response has been read, rather than left open.
    with requests.Session() as session:
        response = session.get(url, headers=headers, timeout=20)
        # ``response.text`` is already a ``str``; no extra conversion needed.
        return response.text
def get_text(html):
    """Pull the job-description fragment out of *html* as normalized text.

    The fragment is whatever lies between the first ``>`` that follows the
    literal ``JobDescription`` and the next ``<footer`` (both matched
    lazily).  When the pattern is absent, an empty fragment is used instead.
    The fragment's markup is stripped with BeautifulSoup (``lxml`` parser)
    and the resulting text is NFKD-normalized.

    Args:
        html: Page source to search.

    Returns:
        The NFKD-normalized text content of the matched fragment.
    """
    match = re.search(
        r'JobDescription[\s\S]*?>([\s\S]*?)<footer', html)
    if match is None:
        fragment = ""
    else:
        fragment = match.group(1)
    plain = BeautifulSoup(fragment, "lxml").text
    return unicodedata.normalize("NFKD", plain)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()