-
Notifications
You must be signed in to change notification settings - Fork 75
Added Behdad Analui's solution #63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
e80871e
f332112
581a83a
76907a2
e5c117d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| .bundle |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| source 'https://rubygems.org' | ||
| gem 'mechanize', '~>2.7.2', '<2.7.3' | ||
| gem 'pry' # because you'll probably need it |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| GEM | ||
| remote: https://rubygems.org/ | ||
| specs: | ||
| coderay (1.1.1) | ||
| domain_name (0.5.20161129) | ||
| unf (>= 0.0.5, < 1.0.0) | ||
| http-cookie (1.0.3) | ||
| domain_name (~> 0.5) | ||
| mechanize (2.7.2) | ||
| domain_name (~> 0.5, >= 0.5.1) | ||
| http-cookie (~> 1.0.0) | ||
| mime-types (~> 1.17, >= 1.17.2) | ||
| net-http-digest_auth (~> 1.1, >= 1.1.1) | ||
| net-http-persistent (~> 2.5, >= 2.5.2) | ||
| nokogiri (~> 1.4) | ||
| ntlm-http (~> 0.1, >= 0.1.1) | ||
| webrobots (>= 0.0.9, < 0.2) | ||
| method_source (0.8.2) | ||
| mime-types (1.25.1) | ||
| mini_portile2 (2.1.0) | ||
| net-http-digest_auth (1.4) | ||
| net-http-persistent (2.9.4) | ||
| nokogiri (1.7.0.1) | ||
| mini_portile2 (~> 2.1.0) | ||
| ntlm-http (0.1.1) | ||
| pry (0.10.4) | ||
| coderay (~> 1.1.0) | ||
| method_source (~> 0.8.1) | ||
| slop (~> 3.4) | ||
| slop (3.6.0) | ||
| unf (0.1.4) | ||
| unf_ext | ||
| unf_ext (0.0.7.2) | ||
| webrobots (0.1.2) | ||
|
|
||
| PLATFORMS | ||
| ruby | ||
|
|
||
| DEPENDENCIES | ||
| mechanize (~> 2.7.2, < 2.7.3) | ||
| pry | ||
|
|
||
| BUNDLED WITH | ||
| 1.13.6 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,58 @@ | ||
| # assignment_web_scraper | ||
| Bottom feeding your way across the web | ||
|
|
||
| Author : Behdad Analui | ||
|
|
||
| [A Ruby-based web scraping and parsing project which uses Nokogiri, Mechanize, and open-uri. From the Viking Code School.](http://www.vikingcodeschool.com) | ||
|
|
||
|
|
||
|
|
||
| ---> URL parameters for different search results | ||
|
|
||
|
|
||
| # Searching : "Web Developer" in "Los Angeles, CA" | ||
|
|
||
| # page 1 : | ||
| https://www.dice.com/jobs?q=Web+Developer&l=Los+Angeles%2C+CA&searchid=4695930607292 | ||
|
|
||
| These results are already filtered by Distance = "Within 30 miles". | ||
|
|
||
| # page 2 : | ||
| https://www.dice.com/jobs/q-Web_Developer-limit-30-l-Los_Angeles%2C_CA-radius-30-startPage-2-limit-30-jobs?searchid=3243616771397 | ||
|
|
||
| # Add filter --> Company Segment = "Recruiter" | ||
| https://www.dice.com/jobs/q-Web_Developer-l-Los_Angeles%2C_CA-dcs-Recruiter-radius-30-jobs.html?searchid=3658329038016 | ||
|
|
||
| # Add filter --> Distance = "Within 10 miles" | ||
| https://www.dice.com/jobs/q-Web_Developer-limit-30-l-Los_Angeles%2C_CA-radius-10-jobs.html?searchid=9229563461169 | ||
|
|
||
| # Add filter --> Distance = "Exact Location" | ||
| https://www.dice.com/jobs/q-Web_Developer-limit-30-l-Los_Angeles%2C_CA-radius-El-jobs.html?searchid=2207553450629 | ||
|
|
||
| # Add filter --> Title = "Software Engineer" | ||
| https://www.dice.com/jobs/q-Web_Developer-l-Los_Angeles%2C_CA-djt-Software_Engineer-radius-El-jobs.html?searchid=1760823245268 | ||
|
|
||
| # Add filter --> Company = "Praedicat, Inc." | ||
| https://www.dice.com/jobs/q-Web_Developer-dc-Praedicat%2C+Inc.-limit-30-l-Los_Angeles%2C_CA-radius-El-jobs.html?searchid=5889586703863 | ||
|
|
||
| # Add filter --> Employment Type = "Part-Time" | ||
| https://www.dice.com/jobs/jtype-Part%20Time-q-Web_Developer-limit-30-l-Los_Angeles%2C_CA-radius-El-jobs.html?searchid=8622557635651 | ||
|
|
||
| # Add filter --> Telecommute = "Yes" | ||
| https://www.dice.com/jobs/q-Web_Developer-l-Los_Angeles%2C_CA-dtco-true-radius-El-jobs.html?searchid=2773396522422 | ||
|
|
||
| # Add a bunch of filters | ||
|
|
||
| Within 10 miles | ||
| Java Developer | ||
| Los Angeles, CA | ||
| Rose IT Corp. | ||
| Full-Time and Part-Time and Contracts(Contract Independent, C2H W2) | ||
| Yes | ||
|
|
||
| https://www.dice.com/jobs/q-Web_Developer-djt-Java+developer-dc-Rose+IT+Corp.-jtype-Full+Time+OR+Part+Time+OR+Contract+Independent+OR+C2H+W2-dtco-true-limit-30-l-Los_Angeles%2C_CA-radius-10-jobs.html?searchid=4066268815121 | ||
|
|
||
|
|
||
| ---> URL parameters for a single job page with title 'Web Application Firewall Engineer - remote' | ||
|
|
||
| https://www.dice.com/jobs/detail/Web-Application-Firewall-Engineer-%26%2345-remote-CGS-Los-Angeles-CA-90017/10477632/BHJOB31_326?icid=sr1-1p&q=Web%20Developer&l=Los%20Angeles,%20CA |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
|
|
||
| # TODO: A major change is needed to support user interaction. Currently only one filter (the company segment) is applied, and its value is effectively hard-coded. | ||
| # The remaining filters should be added, modelled on the existing segment filter. | ||
|
|
||
# One scraped job posting. Members are positional, in this order:
# title, company name, link to the posting on Dice, location,
# posting date, company id and job id.
Job = Struct.new(
  :title,
  :company_name,
  :link_on_dice,
  :location,
  :posting_date,
  :company_id,
  :id
)
|
|
||
| require_relative 'page_formatter' | ||
| require_relative 'scrape_dice' | ||
| require_relative 'result_saver' | ||
| require_relative 'search_criteria' | ||
|
|
||
# Scrapes one Dice.com search (Web Developer jobs around Los Angeles),
# narrows it to a single company segment and collects every listing as a
# Job struct that can then be printed or appended to a CSV file.
class DiceWebScraper
  # Search results page every scrape starts from.
  SEARCH_URL = "https://www.dice.com/jobs/q-Web_Developer-limit-30-l-Los_Angeles%2C_CA-jobs.html?"

  # Company-segment filter applied by #filter_result.
  DEFAULT_SEGMENT = "DirectHire"

  # Number of leading characters dropped from a company link to obtain the
  # company id. 29 equals the length of "https://www.dice.com/company/";
  # presumably that is the prefix being stripped -- confirm against live markup.
  COMPANY_ID_OFFSET = 29

  def initialize
    @all_jobs = []
  end

  # Parses a search results page and appends one Job per listing to the
  # scraper's internal job list.
  #
  # @param search_result [#body] fetched page whose body is the results HTML
  # @return the collection of listing nodes that was iterated
  def extract_job_info(search_result)
    listings = Nokogiri::HTML(search_result.body).css('.complete-serp-result-div')
    listings.each do |job|
      link_on_dice = ScrapeDice.extract_link(job)
      company_link = job.css('ul')[1].children[1].children[2].children[0].attribute('href').value
      company_id = company_link[COMPANY_ID_OFFSET..-1]

      @all_jobs << Job.new(
        ScrapeDice.extract_title(job),
        ScrapeDice.extract_company_name(job),
        link_on_dice,
        ScrapeDice.extract_location(job),
        ScrapeDice.extract_posting_date(job),
        company_id,
        job_id_from(link_on_dice, company_id)
      )
    end
  end

  # Fetches the search page, applies the segment filter and extracts the jobs
  # from the filtered page.
  def scrape
    # criteria = SearchCriteria.new("Web Developer", "Los Angeles, CA", 30, nil, nil, nil, 1, nil)
    search_result = PageFormatter.get_page_from(SEARCH_URL)
    extract_job_info(filter_result(search_result))
  end

  # Applies the default company-segment filter to a results page.
  #
  # Review feedback on the original submission: this method only delegates to
  # #filter_segment and could be removed. It is kept so existing callers keep
  # working.
  def filter_result(search_result)
    filter_segment(search_result, DEFAULT_SEGMENT)
  end

  # Follows the link of one company-segment filter found on a results page.
  #
  # @param search_result [#body] fetched results page
  # @param criteria [String] visible label of the segment filter to follow
  # @return the page fetched from that filter's link
  # @raise [ArgumentError] when the page offers no filter with that label
  def filter_segment(search_result, criteria)
    inputs = Nokogiri::HTML(search_result.body).css('#companySegments input')
    segment_filters = inputs.each_with_object({}) do |node, filters|
      # The filter link is the fourth child of the input's parent node.
      link = node.parent.children[3]
      filters[link.children[0].text] = link.attribute('href').value
    end

    unless segment_filters.key?(criteria)
      raise ArgumentError,
            "unknown segment filter #{criteria.inspect}; available: #{segment_filters.keys.inspect}"
    end

    PageFormatter.get_page_from(segment_filters[criteria])
  end

  # Prints every collected job to standard output.
  def render
    ResultSaver.render(@all_jobs)
  end

  # Appends every collected job to the CSV file.
  def save
    ResultSaver.save(@all_jobs)
  end

  private

  # Pulls the job id out of a posting link: the path segment that follows the
  # company id, up to the query string. The company id is escaped because it
  # is data, not a pattern. Returns nil when the link has no such segment.
  def job_id_from(job_link, company_id)
    match = job_link.match(/#{Regexp.escape(company_id.to_s)}\/(.*)\?/)
    match && match[1]
  end
end
|
|
||
# Run the scrape only when this file is executed directly, so that loading
# it from another file does not trigger network requests or write the CSV
# as a side effect.
if __FILE__ == $PROGRAM_NAME
  scraper = DiceWebScraper.new
  scraper.scrape
  # scraper.render
  scraper.save
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| Title,Company Name,Link,Location,Posted Date,Company ID,Job ID | ||
| Senior Web Developer,"CoStar Realty Information, Inc","https://www.dice.com/jobs/detail/Senior-Web-Developer-CoStar-Realty-Information%2C-Inc-Surfside-CA-90743/10100023/R11421?icid=sr1-1p&q=Web Developer&l=Los Angeles, CA","Surfside, CA","Fri, 13 Jan 2017",10100023,R11421 | ||
| Front End Web Developer,Stamps.com,"https://www.dice.com/jobs/detail/Front-End-Web-Developer-Stamps.com-El-Segundo-CA-90245/RTX15d6ef/721684?icid=sr2-1p&q=Web Developer&l=Los Angeles, CA","El Segundo, CA","Fri, 06 Jan 2017",RTX15d6ef,721684 | ||
| Front End Web Developer,TollFreeForwarding.com,"https://www.dice.com/jobs/detail/Front-End-Web-Developer-TollFreeForwarding.com-Los-Angeles-CA-90045/RTX11a93a/528171?icid=sr3-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles, CA","Fri, 06 Jan 2017",RTX11a93a,528171 | ||
| Java Developer,The TEAM Companies,"https://www.dice.com/jobs/detail/Java-Developer-The-TEAM-Companies-Burbank-CA-91506/RTX1639b7/814271?icid=sr4-1p&q=Web Developer&l=Los Angeles, CA","Burbank, CA","Wed, 11 Jan 2017",RTX1639b7,814271 | ||
| Software Engineer - PHP Web Services,"Age of Learning, Inc.","https://www.dice.com/jobs/detail/Software-Engineer-%26%2345-PHP-Web-Services-Age-of-Learning%2C-Inc.-Glendale-CA-91203/RTX1798fb/665394?icid=sr5-1p&q=Web Developer&l=Los Angeles, CA","Glendale, CA","Fri, 30 Dec 2016",RTX1798fb,665394 | ||
| Web Designer,The Creative Group,"https://www.dice.com/jobs/detail/Web-Designer-The-Creative-Group-Los-Angeles-CA-90024/10466845/00320-9500327280?icid=sr6-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles, CA","Fri, 13 Jan 2017",10466845,00320-9500327280 | ||
| "Web Application Security Manager, Vice President",MUFG,"https://www.dice.com/jobs/detail/Web-Application-Security-Manager%2C-Vice-President-MUFG-Monterey-Park-CA-91754/uboc/10012023-WD?icid=sr7-1p&q=Web Developer&l=Los Angeles, CA","Monterey Park, CA","Fri, 13 Jan 2017",uboc,10012023-WD | ||
| Web Accessibility/ADA Testing,Impiger Mobile Inc,"https://www.dice.com/jobs/detail/Web-Accessibility%26%2347ADA-Testing-Impiger-Mobile-Inc-Glendale-CA-91203/RTX1b76d0/811664?icid=sr8-1p&q=Web Developer&l=Los Angeles, CA","Glendale, CA","Wed, 11 Jan 2017",RTX1b76d0,811664 | ||
| Business Applications Programmer 3,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Business-Applications-Programmer-3-NORTHROP-GRUMMAN-Redondo-Beach-CA-90277/ngitbot/16030204?icid=sr9-1p&q=Web Developer&l=Los Angeles, CA","Redondo Beach, CA","Thu, 12 Jan 2017",ngitbot,16030204 | ||
| Senior Front End Developer (javascript),"CoStar Realty Information, Inc","https://www.dice.com/jobs/detail/Senior-Front-End-Developer-%28javascript%29-CoStar-Realty-Information%2C-Inc-Los-Angeles-CA-90001/10100023/R10965?icid=sr10-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles, CA","Fri, 13 Jan 2017",10100023,R10965 | ||
| Sr. Software Engineer C,"Cast & Crew Entertainment Services, Inc.","https://www.dice.com/jobs/detail/Sr.-Software-Engineer-C-Cast-%26-Crew-Entertainment-Services%2C-Inc.-Burbank-CA-91504/RTL99102/783532?icid=sr11-1p&q=Web Developer&l=Los Angeles, CA","Burbank, CA","Sun, 08 Jan 2017",RTL99102,783532 | ||
| Senior Javascript Engineer,"CoStar Realty Information, Inc","https://www.dice.com/jobs/detail/Senior-Javascript-Engineer-CoStar-Realty-Information%2C-Inc-Los-Angeles-CA-90001/10100023/R11022?icid=sr12-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles, CA","Fri, 13 Jan 2017",10100023,R11022 | ||
| Senior Cloud Infrastructure Engineer (AWS),Warner Bros. Entertainment,"https://www.dice.com/jobs/detail/Senior-Cloud-Infrastructure-Engineer-%28AWS%29-Warner-Bros.-Entertainment-Burbank-CA-91501/WBCA/153437BR?icid=sr13-1p&q=Web Developer&l=Los Angeles, CA","Burbank, CA","Fri, 13 Jan 2017",WBCA,153437BR | ||
| Director - Engineering,Ticketmaster,"https://www.dice.com/jobs/detail/Director-%26%2345-Engineering-Ticketmaster-Hollywood-CA-90028/tmaster/JR-02800?icid=sr14-1p&q=Web Developer&l=Los Angeles, CA","Hollywood, CA","Thu, 12 Jan 2017",tmaster,JR-02800 | ||
| Engineer III - Software,Ticketmaster,"https://www.dice.com/jobs/detail/Engineer-III-%26%2345-Software-Ticketmaster-Hollywood-CA-90028/tmaster/JR-02954?icid=sr15-1p&q=Web Developer&l=Los Angeles, CA","Hollywood, CA","Thu, 12 Jan 2017",tmaster,JR-02954 | ||
| Sr. Network Engineer,MagTek,"https://www.dice.com/jobs/detail/Sr.-Network-Engineer-MagTek-Seal-Beach-CA-90740/10217156/810414?icid=sr16-1p&q=Web Developer&l=Los Angeles, CA","Seal Beach, CA","Fri, 13 Jan 2017",10217156,810414 | ||
| Quality Assurance (QA) Analyst,Rand,"https://www.dice.com/jobs/detail/Quality-Assurance-%28QA%29-Analyst-Rand-Santa-Monica-CA-90407/RTX14de8b/4593?icid=sr17-1p&q=Web Developer&l=Los Angeles, CA","Santa Monica, CA","Fri, 23 Dec 2016",RTX14de8b,4593 | ||
| Network Engineer,Access Networks,"https://www.dice.com/jobs/detail/Network-Engineer-Access-Networks-Valencia-CA-91355/RTX16e807/4682102?icid=sr18-1p&q=Web Developer&l=Los Angeles, CA","Valencia, CA","Tue, 10 Jan 2017",RTX16e807,4682102 | ||
| Senior Systems Engineer,Ciber,"https://www.dice.com/jobs/detail/Senior-Systems-Engineer-Ciber-Burbank-CA-91501/10122753/94967?icid=sr19-1p&q=Web Developer&l=Los Angeles, CA","Burbank, CA","Fri, 13 Jan 2017",10122753,94967 | ||
| Sr. Software Engineer - Digital Cinema,"Fox (Film, TV & Sports)","https://www.dice.com/jobs/detail/Sr.-Software-Engineer-%26%2345-Digital-Cinema-Fox-%28Film%2C-TV-%26-Sports%29-Los-Angeles-Metro-Area-CA-90013/fox/FFE0004389-553?icid=sr20-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles Metro Area, CA","Fri, 13 Jan 2017",fox,FFE0004389-553 | ||
| Full Stack Developer,Media Services,"https://www.dice.com/jobs/detail/Full-Stack-Developer-Media-Services-Los-Angeles-CA-90049/RTX14b198/4623689?icid=sr21-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles, CA","Thu, 12 Jan 2017",RTX14b198,4623689 | ||
| UI Developer,comScore,"https://www.dice.com/jobs/detail/UI-Developer-comScore-Woodland-Hills-CA-91302/10117623/2292?icid=sr22-1p&q=Web Developer&l=Los Angeles, CA","Woodland Hills, CA","Fri, 13 Jan 2017",10117623,2292 | ||
| Software Engineer (Java),"Verifi, Inc","https://www.dice.com/jobs/detail/Software-Engineer-%28Java%29-Verifi%2C-Inc-Los-Angeles-CA-90001/RTL245792/70b8b0c1c2599867cb83?icid=sr23-1p&q=Web Developer&l=Los Angeles, CA","Los Angeles, CA","Fri, 13 Jan 2017",RTL245792,70b8b0c1c2599867cb83 | ||
| Senior Full Stack .NET Developer,BBG Management,"https://www.dice.com/jobs/detail/Senior-Full-Stack-.NET-Developer-BBG-Management-Santa-Monica-CA-90401/90559744/493604396380724?icid=sr24-1p&q=Web Developer&l=Los Angeles, CA","Santa Monica, CA","Thu, 12 Jan 2017",90559744,493604396380724 | ||
| Software Engineer 2/3,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Software-Engineer-2%26%23473-NORTHROP-GRUMMAN-Redondo-Beach-CA-90277/ngitbot/16028937?icid=sr25-1p&q=Web Developer&l=Los Angeles, CA","Redondo Beach, CA","Thu, 12 Jan 2017",ngitbot,16028937 | ||
| Software Engineer 2/3,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Software-Engineer-2%26%23473-NORTHROP-GRUMMAN-Redondo-Beach-CA-90277/ngitbot/16022398?icid=sr26-1p&q=Web Developer&l=Los Angeles, CA","Redondo Beach, CA","Thu, 12 Jan 2017",ngitbot,16022398 | ||
| Measurements & Software Engineer 3,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Measurements-%26-Software-Engineer-3-NORTHROP-GRUMMAN-Redondo-Beach-CA-90277/ngitbot/16024814?icid=sr27-1p&q=Web Developer&l=Los Angeles, CA","Redondo Beach, CA","Thu, 12 Jan 2017",ngitbot,16024814 | ||
| Software Engineer,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Software-Engineer-NORTHROP-GRUMMAN-Redondo-Beach-CA-90277/ngitbot/16023898?icid=sr28-1p&q=Web Developer&l=Los Angeles, CA","Redondo Beach, CA","Thu, 12 Jan 2017",ngitbot,16023898 | ||
| Software Engineer 2,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Software-Engineer-2-NORTHROP-GRUMMAN-Woodland-Hills-CA-91302/ngitbot/16020844?icid=sr29-1p&q=Web Developer&l=Los Angeles, CA","Woodland Hills, CA","Thu, 12 Jan 2017",ngitbot,16020844 | ||
| Software Engineer,NORTHROP GRUMMAN,"https://www.dice.com/jobs/detail/Software-Engineer-NORTHROP-GRUMMAN-Redondo-Beach-CA-90277/ngitbot/16023894?icid=sr30-1p&q=Web Developer&l=Los Angeles, CA","Redondo Beach, CA","Thu, 12 Jan 2017",ngitbot,16023894 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| # all 3 of these requires are necessary | ||
| require 'rubygems' | ||
| require 'bundler/setup' | ||
| require 'mechanize' | ||
|
|
||
# Thin wrapper around Mechanize for fetching a single page.
#
# Review feedback on the original submission: this class is not necessary.
class PageFormatter
  # Fetches +url+ with a freshly configured Mechanize agent and returns the
  # resulting page.
  #
  # The agent identifies itself with the 'Windows Chrome' user-agent alias and
  # sleeps for half a second each time its history_added hook fires.
  def self.get_page_from(url)
    browser = Mechanize.new do |config|
      config.user_agent_alias = 'Windows Chrome'
      config.history_added = Proc.new { sleep 0.5 }
    end
    browser.get(url)
  end
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| require 'csv' | ||
|
|
||
# Outputs scraped jobs, either to the console or as rows of a CSV file.
class ResultSaver
  # Column titles written as the first row of a new CSV file.
  CSV_HEADER = ['Title', 'Company Name', 'Link', 'Location', 'Posted Date', 'Company ID', 'Job ID'].freeze

  # Prints every job as a small labelled block on standard output.
  #
  # @param jobs [Enumerable] objects responding to the Job struct's readers
  def self.render(jobs)
    jobs.each do |job|
      puts "**************** Job *****************"
      puts "\tTitle : #{job.title}"
      puts "\tCompany : #{job.company_name}"
      puts "\tLink : #{job.link_on_dice}"
      puts "\tLocation : #{job.location}"
      puts "\tDate : #{job.posting_date}"
      puts "\tCompany ID : #{job.company_id}"
      puts "\tJob ID : #{job.id}"
    end
  end

  # Appends one CSV row per job to +path+.
  #
  # The header row is written only when the file does not exist yet or is
  # empty, so repeated runs no longer scatter header rows through the data.
  # The block form of CSV.open closes the file even if a write raises.
  #
  # @param jobs [Enumerable] objects responding to the Job struct's readers
  # @param path [String] CSV file to append to (defaults to 'jobs.csv')
  # @return [nil]
  def self.save(jobs, path = 'jobs.csv')
    needs_header = !File.exist?(path) || File.zero?(path)

    CSV.open(path, 'a') do |csv|
      csv << CSV_HEADER if needs_header
      jobs.each do |job|
        csv << [job.title, job.company_name, job.link_on_dice, job.location,
                job.posting_date, job.company_id, job.id]
      end
    end
    nil
  end
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
# Extraction helpers that pull individual fields out of Dice.com markup.
#
# Methods taking +job+ expect one search-result node; methods taking
# +job_page+ expect a fetched job detail page responding to #body.
class ScrapeDice
  # Length of each supported age unit in seconds. A month is approximated as
  # 4.3 weeks and a year as 52 weeks.
  SECONDS_PER_UNIT = {
    'second' => 1,
    'minute' => 60,
    'hour'   => 60 * 60,
    'day'    => 60 * 60 * 24,
    'week'   => 60 * 60 * 24 * 7,
    'month'  => 60.0 * 60.0 * 24.0 * 7.0 * 4.3,
    'year'   => 60 * 60 * 24 * 7 * 52
  }.freeze

  # Format of the date string returned by extract_posting_date.
  DATE_FORMAT = '%a, %d %b %Y'

  # @return [String] the title attribute of the listing's heading link
  def self.extract_title(job)
    job.css('h3 a').attribute('title').value
  end

  # @return [String] the title attribute of the listing's company element
  def self.extract_company_name(job)
    job.css('ul')[1].children[1].children[2].attributes['title'].value
  end

  # @return [String] the title attribute of the listing's location element
  def self.extract_location(job)
    job.css('ul')[1].children[3].attributes['title'].value
  end

  # @return [String] the href of the listing's heading link
  def self.extract_link(job)
    job.css('h3 a').attribute('href').value
  end

  # Reads the company id from the header of a job detail page.
  def self.extract_cid(job_page)
    header_field(job_page, 1)
  end

  # Reads the job id from the header of a job detail page.
  def self.extract_jid(job_page)
    header_field(job_page, 3)
  end

  # Converts a listing's relative age (for example "3 days ago" or
  # "moments ago") into a formatted calendar date.
  #
  # @param job search-result node holding the age text
  # @param now [Time] reference time the age is subtracted from
  # @return [String, nil] date such as "Fri, 13 Jan 2017", or nil (after
  #   printing 'Invalid Entry') when the unit is not recognised
  def self.extract_posting_date(job, now: Time.now)
    words = job.css('ul')[1].children[5].text.split

    if words[0] == 'moments'
      # "moments ago" carries no number; treat it as posted right now.
      how_many = 0
      unit = 'second'
    else
      how_many = words[0].to_i
      # Drop the plural "s" regardless of the count, so "1 day" and "3 days"
      # both resolve to "day".
      unit = words[1].to_s.chomp('s')
    end

    seconds_per_unit = SECONDS_PER_UNIT[unit]
    unless seconds_per_unit
      puts 'Invalid Entry'
      return nil
    end

    (now - how_many * seconds_per_unit).strftime(DATE_FORMAT)
  end

  # Reads one text field from the '.company-header-info' element of a job
  # detail page. When that element does not have exactly five children the
  # fields sit two positions further along -- presumably an extra leading
  # entry; confirm against the page markup.
  def self.header_field(job_page, position)
    header = Nokogiri::HTML(job_page.body).css('.company-header-info')[0]
    offset = header.children.size == 5 ? 0 : 2
    header.children[position + offset].children[1].children[0].text
  end
  private_class_method :header_field
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
# Mutable bag of search settings. Every attribute is readable and writable,
# and all constructor arguments are positional and optional.
#
# NOTE(review): the attribute names appear to mirror the Dice search filters
# (distance, company segment, title, company, employment type, telecommute);
# confirm once the class is actually wired into the scraper.
class SearchCriteria
  attr_accessor :content, :location, :distance, :segment,
                :title, :company, :type, :telecommute

  def initialize(content = "Web Developer", location = "Los Angeles, CA", distance = 30,
                 segment = nil, title = nil, company = nil, type = nil, telecommute = nil)
    @content, @location, @distance = content, location, distance
    @segment, @title, @company = segment, title, company
    @type, @telecommute = type, telecommute
  end
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All the methods except `scrape` and `save` should be `private`.