-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
58 lines (42 loc) · 1.65 KB
/
scraper.py
File metadata and controls
58 lines (42 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#Scraper for a dynamic site, seekingalpha.com
#Scrapes a stock's news headlines and their dates
#Imports
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
#Download chromedriver and point PATH at the executable.
#Raw string: the Windows backslashes must not be read as escape sequences
#(the old non-raw literal relied on invalid escapes such as "\P" and "\G").
PATH = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" #Path to chromedriver
#NOTE(review): passing the driver path positionally is the Selenium 3 calling
#convention; newer Selenium releases expect a Service object - confirm against
#the installed Selenium version.
driver = webdriver.Chrome(PATH)
#Ticker for stock news to scrape
ticker = "FB"
#Despite the "_xpath" names these are values of the HTML class attribute; they
#are interpolated into XPath expressions inside load_page.
#Re-check them against the live page, as they change when the site is updated.
date_xpath = "__7a760-bAtK_ __7a760-uJXfc"
headline_xpath = "__7a760-3a6i8"
#Function for scraping dynamic site
def load_page(ticker, date_xpath, headline_xpath, max_scrolls=600, scroll_pause=.5):
    """Scrape news headlines and their dates for a ticker and save them to CSV.

    Opens the ticker's Seeking Alpha news page in the module-level ``driver``,
    sends the END key to the page body repeatedly so the dynamically loaded
    list can keep extending, then reads every date/headline element and writes
    the de-duplicated result to ``{ticker}_headlines.csv``.

    Args:
        ticker: Symbol inserted into the page URL and the output file name.
        date_xpath: Value of the ``class`` attribute of the ``<span>`` elements
            holding the dates (a class string, not a full XPath).
        headline_xpath: Value of the ``class`` attribute of the ``<a>``
            elements holding the headlines (a class string, not a full XPath).
        max_scrolls: How many times END is sent to the page body.
        scroll_pause: Seconds to wait after each END key press.

    Returns:
        None. The number of rows written is printed and the CSV is created in
        the current working directory.
    """
    # Local import keeps this function self-contained. find_element(By.XPATH, ...)
    # is available in both Selenium 3 and 4, whereas the find_element_by_*
    # helpers used previously were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    url = f"https://seekingalpha.com/symbol/{ticker}/news?filter="
    driver.get(url)
    print(url)
    time.sleep(2)

    #Keep pressing END so the dynamic page loads further entries
    for _ in range(max_scrolls):
        # Looked up on every pass on purpose: a cached reference could go
        # stale if the page re-renders while loading.
        body = driver.find_element(By.XPATH, '//body')
        body.send_keys(Keys.END)
        time.sleep(scroll_pause)

    #Collect the date and headline elements currently present in the page
    date_elements = driver.find_elements(By.XPATH, f'//span[@class="{date_xpath}"]')
    headline_elements = driver.find_elements(By.XPATH, f'//a[@class="{headline_xpath}"]')

    # Dates and headlines are paired purely by position. zip() stops at the
    # shorter list instead of raising IndexError when the counts differ, so
    # report the mismatch rather than failing after a long scrape.
    if len(date_elements) != len(headline_elements):
        print(
            f"Warning: found {len(date_elements)} dates but "
            f"{len(headline_elements)} headlines; extra items are ignored"
        )

    rows = [
        (date_el.text, headline_el.text)
        for date_el, headline_el in zip(date_elements, headline_elements)
    ]
    df = pd.DataFrame(rows, columns=['Date', 'Headlines'])
    # drop_duplicates() returns a new frame; the result must be kept or the
    # duplicates stay in the output.
    df = df.drop_duplicates()
    print(len(df))
    df.to_csv(f"{ticker}_headlines.csv", index=False)
#Run the scraper for the configured ticker and class names
load_page(ticker, date_xpath, headline_xpath)