comics/comicScraper.py at master · JakeDame/comics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#imports
import requests, urllib.request, os, datetime
from isoweek import Week
from bs4 import BeautifulSoup
from pymongo import MongoClient

# MongoDB connection string. It can be overridden through the
# COMICS_MONGO_URI environment variable so that real credentials do not have
# to live in the source; when the variable is unset the original local
# development URI is used.
uri = os.environ.get(
  "COMICS_MONGO_URI",
  "mongodb://admin:admin123@localhost:27017/comics?authSource=admin",
)
client = MongoClient(uri)
db = client['comics']
# Collection that receives one document per scraped comic
books = db['comicBooks']

#create dictionary to store data to send to the database
comicDict = {}

#Page listing the comics released this week
site = "http://comics.gocollect.com/new/this/week"
#A timeout keeps a stalled server from hanging the script forever
page = requests.get(site, timeout=30)
#Fail loudly on an HTTP error instead of parsing an error page as if it were the listing
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

#find the publisher links on the page; every <a class="new_publisher"> is one publisher
findPub = soup.find_all('a', {'class' : 'new_publisher'})

#Keep only the links of the first 10 publishers, in page order.
#Slicing stops at 10 instead of walking the rest of the list with a flag and counter.
publisherUrl = [item['href'] for item in findPub[:10]]

#Get date to fill releaseDate parameter in dictionary: the Wednesday of the current ISO week
now = datetime.datetime.now()
#Take the ISO year together with the ISO week number. Pairing the week number
#with the calendar year is wrong around New Year (e.g. 2018-12-31 belongs to
#ISO week 1 of 2019), and isocalendar() avoids the platform-dependent "%V".
isoYear, weekNum, _ = now.isocalendar()
releaseDate = str(Week(isoYear, weekNum).wednesday())
comicDict['ReleaseDate'] = releaseDate

#Start of Publishers for loop
#For each publisher page: derive the publisher name and folder, then for every
#comic listed there that is not in the database yet, download its cover (when
#one exists) and insert a document built from comicDict.
for pub in publisherUrl:
  #Access the publisher's page; the timeout keeps a stalled server from hanging the run
  newPage = requests.get(pub, timeout=30)
  newSoup = BeautifulSoup(newPage.content, 'html.parser')

  #Start to get data to fill dictionary with
  #Everything after the 6th '/' of the URL is used as the publisher slug,
  #which is also the name of the cover sub-folder
  folderName = pub.split('/', 6)[6]
  if '-' in folderName:
    #multi-word slug, e.g. "some-publisher" -> "Some Publisher"
    pubName = folderName.replace('-', ' ').title()
  elif 'dc' in folderName or 'idw' in folderName:
    #slugs containing these abbreviations are shown fully upper-cased
    pubName = folderName.upper()
  else:
    pubName = folderName.title()

  comicDict['Publisher'] = pubName
  comicDict['Folder'] = folderName

  #Covers are stored relative to this script, one sub-folder per publisher.
  #The path does not depend on the comic, so build it once per publisher.
  dirPathFinal = os.path.join(os.path.dirname(__file__), "app/public/images/covers", folderName)

  #Get list of comics displayed on the publisher's page
  comicsList = newSoup.find_all("li", {"class": "comic"})

  for item in comicsList:
    title = item.strong.get_text()
    comicDict['Title'] = title
    #Cover file name: title without '#', plus '.png', spaces -> underscores, lower-cased
    imgName = (title.replace('#', '') + '.png').replace(' ', '_').lower()
    comicDict['Cover'] = imgName

    imgUrl = item.img['src']

    #Dont want to download duplicate covers or add duplicate documents.
    #find_one replaces Cursor.count(), which newer PyMongo releases removed.
    if books.find_one({"Cover": imgName}) is not None:
      continue

    #Only download when the site provides a real cover; a URL containing
    #'no-item-image' is presumably the site's placeholder picture
    if 'no-item-image' not in imgUrl:
      #Make sure the publisher folder exists, otherwise urlretrieve fails on a fresh checkout
      os.makedirs(dirPathFinal, exist_ok=True)
      urllib.request.urlretrieve(imgUrl, os.path.join(dirPathFinal, imgName))

    #Insert a copy so the _id PyMongo adds to the inserted document does not
    #leak into comicDict; insert_one replaces the removed Collection.insert()
    books.insert_one(comicDict.copy())

#End loop