Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/.bundle/
/.yardoc
/Gemfile.lock
/_yardoc/
/coverage/
/doc/
/pkg/
/spec/reports/
/tmp/
3 changes: 3 additions & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--color
--require spec_helper
--format documentation
8 changes: 8 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Gems are resolved against the canonical RubyGems host.
source "https://rubygems.org"

# Specify your gem's dependencies in pingpongpairing.gemspec
gem "rspec",       "~> 3.5.0"
gem "guard",       "~> 2.14"
gem "guard-shell", "~> 0.7.1"
gem "guard-rspec", "~> 4.7.3"
gem "mechanize"
22 changes: 22 additions & 0 deletions Guardfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Re-run the matching spec whenever a watched file changes.
guard :rspec, cmd: "bundle exec rspec" do
  # watch /lib/ files
  # NOTE: dots are escaped so the pattern matches a literal ".rb"
  # suffix rather than any character before "rb".
  watch(%r{^lib/(.+)\.rb$}) do |m|
    "spec/#{m[1]}_spec.rb"
  end

  watch(%r{^lib/gameboard/(.+)\.rb$}) do |m|
    "spec/#{m[1]}_spec.rb"
  end

  # watch /spec/ files
  watch(%r{^spec/(.+)\.rb$}) do |m|
    "spec/#{m[1]}.rb"
  end
end

guard :shell do
  watch %r{^pair/.*\.rb$} do |m|
    # `n` is Guard's notification helper: pop a "Changed" notice,
    # then push the changed file to the local webhook server.
    n m[0], 'Changed'
    `bundle exec exe/sendevent "http://localhost:5000" #{m[0]}`
  end
end
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
Bottom feeding your way across the web

[A Ruby-based web scraping and parsing project which uses Nokogiri, Mechanize, and OpenURI. From the Viking Code School.](http://www.vikingcodeschool.com)


John Sawyer
16 changes: 16 additions & 0 deletions exe/parallel
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Run every argument as its own command, all in parallel, then wait for
# each one to finish.  Example: parallel "cmd one" "cmd two"

# $cmd is deliberately left unquoted so multi-word arguments (e.g.
# "bundle exec guard start -i") split into a command plus its options.
for cmd in "$@"; do {
echo "Process \"$cmd\" started";
$cmd & pid=$!
PID_LIST+=" $pid";
} done

# Forward Ctrl-C to every child so one interrupt stops the whole group.
trap "kill $PID_LIST" SIGINT

echo "Parallel processes have started";

# Block until every backgrounded PID has exited.
wait $PID_LIST

echo
echo "All processes have completed";
3 changes: 3 additions & 0 deletions exe/pingpong
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Launch the ngrok tunnel, the webhook server, and guard side by side
# via the sibling `parallel` helper script.
# $0 and the derived directory are quoted so the script still works
# when installed under a path containing spaces; $(...) replaces the
# legacy backtick command substitution.
my_dir=$(dirname "$0")
"$my_dir/parallel" "$my_dir/tunnel" "$my_dir/webhook" "bundle exec guard start -i"
24 changes: 24 additions & 0 deletions exe/sendevent
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env ruby
# POST the contents of a local file to a webhook endpoint as JSON.
#
# Usage: sendevent BASE_URL FILE
#   Sends {filename:, data:} to "#{BASE_URL}/event".
require "net/http"
require "uri"
require "json"

abort("Usage: sendevent BASE_URL FILE") unless ARGV.length == 2

url = "#{ARGV[0]}/event"
file_to_serialize = ARGV[1]
# File.read opens, reads, and closes in one call — no leaked handle.
contents = File.read(file_to_serialize)

new_json = {
  filename: File.basename(file_to_serialize),
  data: contents
}

req_body = JSON.generate(new_json)
uri = URI.parse(url)
http = Net::HTTP.new(uri.host, uri.port)
request = Net::HTTP::Post.new(uri.request_uri, 'Content-Type' => 'application/json')
request.body = req_body
http.request(request)
2 changes: 2 additions & 0 deletions exe/tunnel
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Expose local port 5000 (the webhook server's port) through an ngrok
# HTTP tunnel so external services can reach it.
ngrok http 5000
18 changes: 18 additions & 0 deletions exe/webhook
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# Minimal webhook receiver: accepts JSON POSTs on /event and writes the
# payload's "data" into spec/<filename> so guard-rspec picks it up.
require 'sinatra'
require 'json'

set :port, 5000

post '/event' do
  status 204 #successful request with no body content

  request.body.rewind
  p request_payload = JSON.parse(request.body.read)

  # SECURITY: the filename comes from an untrusted HTTP payload.
  # File.basename strips any directory components so a malicious value
  # like "../../etc/passwd" cannot escape the spec/ directory.
  safe_name = File.basename(request_payload["filename"].to_s)

  #append the payload to a file
  File.open("spec/#{safe_name}", "w") do |f|
    f.puts(request_payload["data"])
  end
end

851 changes: 851 additions & 0 deletions jobs_2016_11_14.csv

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions lib/job_search.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
require_relative 'parse_dice'
require_relative 'parse_indeed'
require 'csv'

# Orchestrates the per-site scrapers and dumps the combined results to a
# dated CSV file in the working directory.
class JobSearch

  # Kick off every scraper for the given term/location and write
  # whatever they find straight to CSV.
  def initialize(term, location = nil)
    scrapers = [ParseDice.new(term, location), ParseIndeed.new(term, location)]
    to_csv(run_scrapers(scrapers))
  end

  # Run each scraper's #search and merge all hits into one flat array.
  def run_scrapers(scrapers)
    scrapers.map(&:search).flatten
  end

  # Write one row per unique non-nil job hash, preceded by a header row.
  def to_csv(results)
    stamp = Time.now.strftime("%Y_%m_%d")
    CSV.open("jobs_#{stamp}.csv", "w+") do |csv|
      csv << ["Title", "Link", "Description", "Company", "Company Site", "Location", "Date"]
      results.compact.uniq.each do |job|
        csv << [job[:title], job[:link], job[:desc], job[:company], job[:company_url], job[:location], job[:date]]
      end
    end
  end

end


dice = JobSearch.new( 'Developer', '33613' )
38 changes: 38 additions & 0 deletions lib/parse_dice.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
require_relative 'web_scraper'

# Scrapes full-time job listings from dice.com search result pages.
class ParseDice < WebScraper

  # term:     search keyword(s)
  # location: optional zip/city, expanded to a 30-mile radius filter
  def initialize(term, location = nil)
    @agent = Mechanize.new
    @term = term
    @location = "-l-#{location}-radius-30" if location
  end

  # Absolute URL for one page of full-time results (120 per page).
  def build_url(page)
    "https://www.dice.com/jobs/jtype-Full\%20Time-q-#{@term}#{@location}-startPage-#{page}-limit-120-jobs.html"
  end

  # Turn one fetched results page into an array of job hashes.
  def organize(results)
    results.search("div.serp-result-content").map do |listing|
      heading  = listing.search("h3")[0]
      employer = listing.search("li.employer .hidden-xs a")[0]
      posted   = listing.search("li.posted")[0].text.strip
      posted   = calculate_date(posted).strftime("%Y_%m_%d %l %p") if posted
      #job_id = result.search("input")[0]
      {
        title: heading.text.strip,
        link: heading.search("a")[0]["href"],
        desc: listing.search("div.shortdesc")[0].text.strip,
        company: employer.text.strip,
        company_url: employer["href"],
        location: listing.search("li.location")[0].text.strip,
        date: posted
      }
    end
  end


end

52 changes: 52 additions & 0 deletions lib/parse_indeed.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
require_relative 'web_scraper'

# Scrapes full-time job listings from indeed.com search result pages.
class ParseIndeed < WebScraper

  # term:     search keyword(s)
  # location: optional zip/city appended as the &l= query parameter
  def initialize(term, location = nil)
    @agent = Mechanize.new
    @term = term
    @location = "&l=#{location}" if location
  end

  # Absolute URL for one page of full-time results (Indeed pages by 10s).
  def build_url(page)
    "http://www.indeed.com/jobs?q=#{@term}#{@location}&jt=fulltime&start=#{(page-1)*10}"
  end

  # Parse one results page; returns [] once pagination runs out.
  def organize(results)
    nav_markers = results.search("span.np")
    first_marker = nav_markers[0]
    if first_marker && first_marker.text.strip == "« Previous"
      # When the first nav marker is "Previous", a second marker
      # ("Next") must also exist or this was the last page.
      return [] unless nav_markers[1]
    end
    parse_results(results.search("div.result"))
  end

  # Build one job hash per listing element.
  def parse_results(results)
    results.map do |listing|
      heading = listing.search("h2.jobtitle")[0]
      if heading
        title = heading.text.strip
        link = heading.search("a")[0]["href"]
      else
        # Some listings render the title as a bare anchor, not an h2.
        anchor = listing.search("a.jobtitle")[0]
        title = anchor.text.strip
        link = anchor["href"]
      end
      employer = listing.search("span.company")[0]
      employer_anchor = employer.search("a")[0]
      posted = listing.search("span.date")[0].text.strip
      posted = calculate_date(posted).strftime("%Y_%m_%d %l %p") if posted
      #job_id = result.search("input")[0]
      {
        title: title,
        link: link,
        desc: listing.search("span.summary")[0].text.strip,
        company: employer.text.strip,
        company_url: employer_anchor && employer_anchor["href"],
        location: listing.search("span.location")[0].text.strip,
        date: posted
      }
    end
  end

end
47 changes: 47 additions & 0 deletions lib/web_scraper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
require 'mechanize'
require 'active_support/core_ext/numeric/time'

# Base class for the per-site scrapers.  Subclasses must implement
# #build_url(page) and #organize(results); #search then walks every
# results page and accumulates the parsed job hashes.
class WebScraper

  attr_reader :agent

  def initialize
    @agent = Mechanize.new
  end

  # Subclass hook: turn one fetched page into an array of job hashes.
  # BUGFIX: `raise NotImplementedError("...")` called the class like a
  # method and raised NoMethodError; `raise Class, msg` is the intent.
  def organize(_results)
    raise NotImplementedError, "Methods Need to be defined by subclass"
  end

  # Subclass hook: absolute URL of the given results page number.
  def build_url(_page)
    raise NotImplementedError, "Methods Need to be defined by subclass"
  end

  # Fetch pages starting at `page` until one comes back empty,
  # accumulating every job hash.
  # BUGFIX: always return the results array — previously an empty page
  # made the trailing `if` modifier the last expression, so the method
  # returned nil instead of the collected results.
  def search(page = 1)
    sleep(1) # throttle so we don't hammer the job site
    puts build_url(page)
    results = organize(agent.get(build_url(page)))
    results.concat(search(page + 1)) unless results.empty?
    results
  end

  # Convert a relative date string ("3 hours ago", "2 weeks ago", ...)
  # into a Time via ActiveSupport's duration helpers.
  # BUGFIX: phrases with no recognized unit ("Just posted", "Today")
  # used to return nil, and both subclasses immediately call #strftime
  # on the result — now they fall back to Time.now.
  def calculate_date(date_string)
    amount, unit = date_string.split(" ")
    amount = amount.to_i
    case unit
    when /minute/ then amount.minutes.ago
    when /hour/   then amount.hours.ago
    when /day/    then amount.days.ago
    when /week/   then amount.weeks.ago
    when /month/  then amount.months.ago
    else Time.now # unrecognized phrase: treat as "posted just now"
    end
  end


end


Loading