diff --git a/pipelines.py b/pipelines.py index 3b89206..4efa2db 100755 --- a/pipelines.py +++ b/pipelines.py @@ -5,7 +5,6 @@ import logging from Queue import Queue, Empty from collections import defaultdict, Counter -import pprint from hkjc.items import * from hkjc.models import * from scrapy.signalmanager import SignalManager @@ -20,10 +19,9 @@ from twisted.python.threadpool import ThreadPool from datetime import datetime, date import re -import scrapy import pprint from scrapy.contrib.pipeline.images import ImagesPipeline -from scrapy.exceptions import DropItem +import hashlib # Needed for multithreading, as I remember Session = scoped_session(sessionmaker(bind=engine)) @@ -201,12 +199,12 @@ def get_id(self, model, unique, fields): pp = pprint.PrettyPrinter(indent=4) # class NoInRaceImagePipeLine(ImagesPipeline): -# def set_filename(self, response): -# #add a regex here to check the title is valid for a filename. -# return 'full/{0}.jpg'.format(response.meta['Url'][0]) +# def set_filename(self, response): +# #add a regex here to check the title is valid for a filename. +# return 'full/{0}.jpg'.format(response.meta['Url'][0]) -# def get_media_requests(self, item, info): -# for image_url in item['image_urls']: +# def get_media_requests(self, item, info): +# for image_url in item['image_urls']: # # yield scrapy.Request(image_url, meta={'url': item['url']}) # yield scrapy.Request(image_url) @@ -221,48 +219,58 @@ def getLBW(lbw, place, LBWFirst): else: return lbw + +""" def getplace(place): # r_dh = r'.*[0-9].*DH$' if "DH" in place: return int(re.sub(r'[^\d.]', '', place)) # return int(placenum.replace("DH", '')) else: - return - { - "WV": 99, - "WV-A": 99, - "WX": 99, - "WX-A": 99, - "UV": 99, - "DISQ": 99, - "FE": 99, - "DNF":99, - "PU": 99, - "TNP":99, - "UR": 99, - }.get(str(place), int(place)) + return {"WV": 99, + "WV-A": 99, + "WX": 99, + "WX-A": 99, + "UV": 99, + "DISQ": 99, + "FE": 99, + "DNF": 99, + "PU": 99, + "TNP": 99, + "UR": 99}.get(str(place), int(place)) +""" + + +def getplace(place): + if place in ("WV", "WV-A", "WX", "WX-A", "UV", "DISQ", "FE", "DNF", "PU", "TNP", "UR"): + return 99 + else: + try: + return int(re.sub('\D', '', place)) + except ValueError: + return None def getnosectionals(distance): if distance is None or distance == 0: return 0 else: - return { - '1000': 3, - '1100': 3, - '1200': 3, - '1400': 4, - '1500': 4, - '1600': 4, - '1650': 4, - '1700': 5, - '1750': 5, - '1800': 5, - '1900': 5, - '2000': 6, - '2200': 6, - '2400': 6 - }.get(str(distance),0) + return {'1000': 3, + '1100': 3, + '1200': 3, + '1400': 4, + '1500': 4, + '1600': 4, + '1650': 4, + '1700': 5, + '1750': 5, + '1800': 5, + '1900': 5, + '2000': 6, + '2200': 6, + '2400': 6 + }.get(str(distance), 0) + def gethorseprize(placenum, prizemoney): # print (place, float(prizemoney)) @@ -270,16 +278,18 @@ def gethorseprize(placenum, prizemoney): return None # print (place, prizemoney) return { - '1': float(57.0*float(prizemoney))/100.0, - '2': float(22.0*float(prizemoney))/100.0, - '3': float(11.5*float(prizemoney))/100.0, - '4': float(6.0*float(prizemoney))/100.0, - '5': float(3.5*float(prizemoney))/100.0 + '1': float(57.0 * float(prizemoney)) / 100.0, + '2': float(22.0 * float(prizemoney)) / 100.0, + '3': float(11.5 * float(prizemoney)) / 100.0, + '4': float(6.0 * float(prizemoney)) / 100.0, + '5': float(3.5 * float(prizemoney)) / 100.0 }.get(str(placenum), 0.0) + + #inraceimage one per race # @dbdefer # class MyImagesPipeline(ImagesPipeline): - + # def file_path(self, request, response=None, info=None): # #item=request.meta['item'] # Like this you can use all from item, not just url. # #http://www.hkjc.com/english/racing/finishphoto.asp?racedate=20141220R1_L.jpg @@ -313,53 +323,48 @@ def gethorseprize(placenum, prizemoney): # return item class SQLAlchemyPipeline(object): - def __init__(self): - self.scheduler = DBScheduler() @inlineCallbacks - def process_item(self, item, spider): - + def process_item(self, item, spider): if isinstance(item, ResultsItem): - hkdividendid = self.scheduler.get_id( HKDividend, 'PublicRaceIndex', { "PublicRaceIndex": item["RacecourseCode"] + item["RaceDate"] + str(item["RaceNumber"]), - "RaceDate": item["RaceDate"], - "RaceNumber": item["RaceNumber"], - "RacecourseCode": item.get("RacecourseCode", None), - "PublicRaceIndex": item["RacecourseCode"] + - item["RaceDate"] + str(item["RaceNumber"]), - "WinDiv": item.get("WinDiv", None), - "Place1Div":item.get("Place1Div"), - "Place2Div": item.get("Place2Div", None), - "Place3Div":item.get("Place3Div", None), - "QNDiv": item.get("QNDiv", None), - "QP12Div": item.get("QP12Div"), - "QP13Div": item.get("QP13Div", None), - "QP23Div": item.get("QP23Div", None), - "TierceDiv": item.get("TierceDiv"), - "TrioDiv": item.get("TrioDiv", None), - "FirstfourDiv": item.get("FirstfourDiv", None), - "QuartetDiv": item.get("QuartetDiv", None), - "ThisDouble11Div": item.get("ThisDouble11Div", None), - "ThisDouble12Div": item.get("ThisDouble12Div",None), - "Treble111Div": item.get("Treble111Div", None), - "Treble112Div": item.get("Treble112Div",None), - "ThisDoubleTrioDiv": item.get("ThisDoubleTrioDiv", None), - "TripleTrio111Div": item.get("TripleTrio111Div", None), - "TripleTrio112Div": item.get("TripleTrio112Div", None), - "SixUpDiv": item.get("SixUpDiv", None), - "SixUpBonusDiv": item.get("SixUpBonusDiv", None) + "RaceDate": item["RaceDate"], + "RaceNumber": item["RaceNumber"], + "RacecourseCode": item.get("RacecourseCode", None), + "PublicRaceIndex": item["RacecourseCode"] + + item["RaceDate"] + str(item["RaceNumber"]), + "WinDiv": item.get("WinDiv", None), + "Place1Div": item.get("Place1Div"), + "Place2Div": item.get("Place2Div", None), + "Place3Div": item.get("Place3Div", None), + "QNDiv": item.get("QNDiv", None), + "QP12Div": item.get("QP12Div"), + "QP13Div": item.get("QP13Div", None), + "QP23Div": item.get("QP23Div", None), + "TierceDiv": item.get("TierceDiv"), + "TrioDiv": item.get("TrioDiv", None), + "FirstfourDiv": item.get("FirstfourDiv", None), + "QuartetDiv": item.get("QuartetDiv", None), + "ThisDouble11Div": item.get("ThisDouble11Div", None), + "ThisDouble12Div": item.get("ThisDouble12Div", None), + "Treble111Div": item.get("Treble111Div", None), + "Treble112Div": item.get("Treble112Div", None), + "ThisDoubleTrioDiv": item.get("ThisDoubleTrioDiv", None), + "TripleTrio111Div": item.get("TripleTrio111Div", None), + "TripleTrio112Div": item.get("TripleTrio112Div", None), + "SixUpDiv": item.get("SixUpDiv", None), + "SixUpBonusDiv": item.get("SixUpBonusDiv", None) }) - raceclassid = self.scheduler.get_id( - Raceclass, "Name", + Raceclass, "Name", { - "Name": item.get("Raceclass", None) + "Name": item.get("Raceclass", None) }) railtypeid = self.scheduler.get_id( @@ -367,22 +372,22 @@ def process_item(self, item, spider): { "Name": item.get("Railtype", None) }) - + goingid = self.scheduler.get_id( Going, "Name", { "Name": item.get("Going", None) - }) + }) distanceid = self.scheduler.get_id( Distance, "MetricName", { - "MetricName": int(item.get("Distance", 0)), - "Miles": float(float(item.get("Distance", 0))/1600.0), - "Furlongs": int(int(item.get("Distance", 0))/200) - }) - + "MetricName": int(item.get("Distance", 0)), + "Miles": float(float(item.get("Distance", 0)) / 1600.0), + "Furlongs": int(int(item.get("Distance", 0)) / 200) + }) + # gearid = self.scheduler.get_id( # Gear, "name", # { @@ -395,7 +400,7 @@ def process_item(self, item, spider): "Name": item["Trainer"], "Homecountry": "HKG" - }) + }) jockeyid = self.scheduler.get_id( Jockey, "Name", @@ -405,13 +410,12 @@ def process_item(self, item, spider): "Homecountry": "HKG" }) - - horseid=self.scheduler.get_id( + horseid = self.scheduler.get_id( Horse, "Code", { - "Code": item["HorseCode"], - "Name": item["Horse"], - "Homecountry": "HKG" + "Code": item["HorseCode"], + "Name": item["Horse"], + "Homecountry": "HKG" }) @@ -424,84 +428,87 @@ def process_item(self, item, spider): raceid = self.scheduler.get_id( HKRace, "PublicRaceIndex", { - "Url": item.get("Url", None), - "RacecourseCode": item["RacecourseCode"], - "RaceDate": item["RaceDate"], - "Name": item["Name"], - # "Inraceimage": item["images"], - # "Inraceimage": item["images"][0]['data'], - "Inraceimage": item["images"][0]['data'] if item["images"] else None, - "RaceNumber": int(item["RaceNumber"]), - "PublicRaceIndex": item["RacecourseCode"] + - item["RaceDate"] + str(item["RaceNumber"]), - "IncidentReport": item.get("IncidentReport", None), - "RaceIndex": item.get("RaceIndex", None), - "Prizemoney": item.get("Prizemoney", None), - "Raceratingspan": item.get("Raceratingspan", None), - "Surface": item.get("Surface", None), - "Dayofweek": item.get("Dayofweek", None), - "NoSectionals": getnosectionals(item.get("Distance", 0)), - "hk_going_id": goingid, - "hk_raceclass_id": raceclassid, - "hk_railtype_id": railtypeid, - "hk_distance_id": distanceid, - "hk_dividend_id": hkdividendid + "Url": item.get("Url", None), + "RacecourseCode": item["RacecourseCode"], + "RaceDate": item["RaceDate"], + "Name": item["Name"], + # "Inraceimage": item["images"], + # "Inraceimage": item["images"][0]['data'], + "Inraceimage": item["images"][0]['data'] if item["images"] else None, + "RaceNumber": int(item["RaceNumber"]), + "PublicRaceIndex": item["RacecourseCode"] + + item["RaceDate"] + str(item["RaceNumber"]), + "IncidentReport": item.get("IncidentReport", None), + "RaceIndex": item.get("RaceIndex", None), + "Prizemoney": item.get("Prizemoney", None), + "Raceratingspan": item.get("Raceratingspan", None), + "Surface": item.get("Surface", None), + "Dayofweek": item.get("Dayofweek", None), + "NoSectionals": getnosectionals(item.get("Distance", 0)), + "hk_going_id": goingid, + "hk_raceclass_id": raceclassid, + "hk_railtype_id": railtypeid, + "hk_distance_id": distanceid, + "hk_dividend_id": hkdividendid }) # ownerid = yield ownerid - + jockeyid = yield jockeyid horseid = yield horseid raceid = yield raceid - trainerid = yield trainerid - - - runner = HKRunner( - HorseNumber=item.get("HorseNumber", def_int), - Jockey=item["Jockey"], - Trainer=item["Trainer"], - ActualWt=item["ActualWt"], - DeclarHorseWt=item["DeclarHorseWt"], - Draw=item.get("Draw", None), - LBW = item.get("LBW", None), - isScratched = item.get("isScratched", None), - # LBW= getLBW(item.get("LBW", None),item.get("Place", None), item.get("LBWFirst", None)), - RunningPosition=item.get("RunningPosition", None), - Sec1DBL=item.get("Sec1DBL", None), - Sec2DBL=item.get("Sec2DBL", def_DBL), - Sec3DBL=item.get("Sec3DBL", def_DBL), - Sec4DBL=item.get("Sec4DBL", def_DBL), - Sec5DBL=item.get("Sec5DBL", def_DBL), - Sec6DBL=item.get("Sec6DBL", def_DBL), - FinishTime=item.get("FinishTime", def_time), - Sec1Time=item.get("Sec1time", def_time), - Sec2Time=item.get("Sec2time", def_time), - Sec3Time=item.get("Sec3time", def_time), - Sec4Time=item.get("Sec4time", def_time), - Sec5Time=item.get("Sec5time", def_time), - Sec6Time=item.get("Sec6time", def_time), - WinOdds=item.get("Winodds", None), - HorseReport=item.get("HorseReport", None), - PlaceNum = getplace(item.get("Place", None)), - Place = item.get("Place", None), - Horseprize = gethorseprize(item.get("PlaceNum", None), item.get("Prizemoney", None)), - PublicRaceIndex = item["RacecourseCode"] + item["RaceDate"] + str(item["RaceNumber"]) + item["Horse"], - hk_race_id=raceid, - jockey_id= jockeyid, - trainer_id=trainerid, - horse_id=horseid) - - self.scheduler.save(runner) - + trainerid = yield trainerid + + self.scheduler.get_id(HKRunner, + "PublicRaceIndex", + dict(HorseNumber=item.get("HorseNumber", def_int), + Jockey=item["Jockey"], + Trainer=item["Trainer"], + ActualWt=item["ActualWt"], + DeclarHorseWt=item["DeclarHorseWt"], + Draw=item.get("Draw", None), + LBW=item.get("LBW", None), + isScratched=item.get("isScratched", None), + # LBW= getLBW(item.get("LBW", None),item.get("Place", None), item.get("LBWFirst", None)), + RunningPosition=item.get("RunningPosition", None), + Sec1DBL=item.get("Sec1DBL", None), + Sec2DBL=item.get("Sec2DBL", def_DBL), + Sec3DBL=item.get("Sec3DBL", def_DBL), + Sec4DBL=item.get("Sec4DBL", def_DBL), + Sec5DBL=item.get("Sec5DBL", def_DBL), + Sec6DBL=item.get("Sec6DBL", def_DBL), + FinishTime=item.get("FinishTime", def_time), + Sec1Time=item.get("Sec1time", def_time), + Sec2Time=item.get("Sec2time", def_time), + Sec3Time=item.get("Sec3time", def_time), + Sec4Time=item.get("Sec4time", def_time), + Sec5Time=item.get("Sec5time", def_time), + Sec6Time=item.get("Sec6time", def_time), + WinOdds=item.get("Winodds", None), + HorseReport=item.get("HorseReport", None), + PlaceNum=getplace(item.get("Place", None)), + Place=item.get("Place", None), + Horseprize=gethorseprize(item.get("PlaceNum", None), + item.get("Prizemoney", + None)), + PublicRaceIndex=item["RacecourseCode"] + item[ + "RaceDate"] + str(item["RaceNumber"]) + + item["Horse"], + hk_race_id=raceid, + jockey_id=jockeyid, + trainer_id=trainerid, + horse_id=horseid)) returnValue(item) + ''' usage instructions: ''' + class ByteStorePipeline(ImagesPipeline): - def media_downloaded(self, response, request, info): + def media_downloaded(self, response, request, info): referer = request.headers.get('Referer') if response.status != 200: