From bcdca397910520f8c4b0e344ae6f31b327832dad Mon Sep 17 00:00:00 2001 From: CountCulture Date: Mon, 22 Mar 2021 09:02:36 +0000 Subject: [PATCH 01/30] Initial commit (probably not working properly) to share ideas --- lib/openc_bot/helpers/persistence_handler.rb | 76 +++++++++++++++++++ .../helpers/pseudo_machine_fetcher.rb | 52 +++++++++++++ .../helpers/pseudo_machine_parser.rb | 35 +++++++++ .../helpers/pseudo_machine_transformer.rb | 41 ++++++++++ .../pseudo_machine_company_fetcher_bot.rb | 28 +++++++ .../helpers/persistence_handler_spec.rb | 27 +++++++ .../helpers/pseudo_machine_fetcher_spec.rb | 34 +++++++++ .../helpers/pseudo_machine_parser_spec.rb | 36 +++++++++ .../pseudo_machine_transformer_spec.rb | 39 ++++++++++ ...pseudo_machine_company_fetcher_bot_spec.rb | 23 ++++++ 10 files changed, 391 insertions(+) create mode 100644 lib/openc_bot/helpers/persistence_handler.rb create mode 100644 lib/openc_bot/helpers/pseudo_machine_fetcher.rb create mode 100644 lib/openc_bot/helpers/pseudo_machine_parser.rb create mode 100644 lib/openc_bot/helpers/pseudo_machine_transformer.rb create mode 100644 lib/openc_bot/pseudo_machine_company_fetcher_bot.rb create mode 100644 spec/lib/openc_bot/helpers/persistence_handler_spec.rb create mode 100644 spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb create mode 100644 spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb create mode 100644 spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb create mode 100644 spec/lib/pseudo_machine_company_fetcher_bot_spec.rb diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb new file mode 100644 index 0000000..2864add --- /dev/null +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -0,0 +1,76 @@ +# frozen_string_literal: true + +module OpencBot + module Helpers + + module PersistenceHandler + + def input_stream + # override in segment bots + end + + def output_stream + self.name.to_s[/[A-Z][a-z]+$/].downcase + end + + def acquisition_base_directory + ENV['ACQUISITION_BASE_DIRECTORY'] || "data" + end + + def acquisition_id + @@acquisition_id ||= ( + ENV["ACQUISITION_ID"] || + (in_progress_acquisition_id && !ENV["FORCE_NEW_ACQUISITION"]) || + Time.now.to_i + ) + end + + # gets the most recent in progress acquisition id, based on in processing + # directories + def in_progress_acquisition_id + in_progress_acquisitions = Dir.glob("#{acquisition_base_directory}/*_processing").sort + return if in_progress_acquisitions.empty? + in_progress_acquisitions.last.filename.sub("_processing","") + end + + def input_file_location + File.join(acquisition_directory_processing, "#{input_stream}.jsonl") + end + + def output_file_location + File.join(acquisition_directory_processing, "#{output_stream}.jsonl") + end + + def acquisition_directory_processing + processing_directory = ENV['ACQUISITION_DIRECTORY'] || File.join(acquisition_base_directory, "#{acquisition_id}_processing") + File.mkdir(processing_directory) unless Dir.exist?(processing_directory) + processing_directory + end + + def acquisition_directory_final + File.join(acquisition_base_directory, in_progress_acquisition_id) + end + + def records_processed + `wc -l "#{output_file_location}"`.strip.split(' ')[0].to_i + end + + def get_input_data + File.foreach(input_file_location) do |line| + yield JSON.parse(line) + end + end + + def persist(res) + File.open(output_file_location, "a") do |f| + f.puts res.to_json + end + end + + private + def mark_acquisition_directory_as_finished_processing + File.rename(acquisition_directory_processing, acquisition_directory_final) + end + end + end +end diff --git a/lib/openc_bot/helpers/pseudo_machine_fetcher.rb b/lib/openc_bot/helpers/pseudo_machine_fetcher.rb new file mode 100644 index 0000000..35d08f3 --- /dev/null +++ b/lib/openc_bot/helpers/pseudo_machine_fetcher.rb @@ -0,0 +1,52 @@ +require "openc_bot/helpers/persistence_handler" + +module OpencBot + module Helpers + module PseudoMachineFetcher + include OpencBot::Helpers::PersistenceHandler + include OpencBot::Helpers::RegisterMethods + + def dataset_based + const_defined?("DATASET_BASED") && const_get("DATASET_BASED") + end + + def run + fetch_data_results = fetch_data + # ignore stale for the moment + # update_stale_results = update_stale + res = {} + res.merge!(fetch_data_results) if fetch_data_results.is_a?(Hash) + res + rescue Exception => e + send_error_report(e, options) + raise e + end + + def fetch_data_via_dataset + # to be implemented by fetcher code that includes this + # should persist data using persist(datum) + end + + def fetch_data + start_time = Time.now.utc + res = {} + if use_alpha_search + fetch_data_via_alpha_search + res[:run_type] = "alpha" + elsif dataset_based + fetch_data_via_dataset + res[:run_type] = "dataset" + else + new_highest_numbers = fetch_data_via_incremental_search + res[:run_type] = "incremental" + res[:output] = "New highest numbers = #{new_highest_numbers.inspect}" + end + res.merge(fetched: records_processed, fetch_start: start_time, fetch_end: Time.now.utc) + rescue OpencBot::OutOfPermittedHours, OpencBot::SourceClosedForMaintenance, Interrupt, SystemExit => e + res.merge!({ fetch_data_output: { error: exception_to_object(e) } }) + rescue StandardError => e + res.merge!({ fetch_data_error: { error: exception_to_object(e) } }) + end + end + end +end diff --git a/lib/openc_bot/helpers/pseudo_machine_parser.rb b/lib/openc_bot/helpers/pseudo_machine_parser.rb new file mode 100644 index 0000000..c75fb2d --- /dev/null +++ b/lib/openc_bot/helpers/pseudo_machine_parser.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +require "openc_bot/helpers/persistence_handler" + +module OpencBot + module Helpers + module PseudoMachineParser + include OpencBot::Helpers::PersistenceHandler + + def input_stream + "fetcher" + end + + def parse(fetched_datum) + # implement in bot + # may return either a single parsed datum or an array of data from + # single fetched_datum, e.g. if fetched_datum is a CSV file + end + + def run + start_time = Time.now.utc + counter = 0 + get_input_data do |fetched_datum| + parsed_data = parse(fetched_datum) + parsed_data = [parsed_data] unless parsed_data.is_a?(Array) + parsed_data.each do |parsed_datum| + persist(parsed_datum) + counter += 1 + end + end + {parsed: counter, parser_start: start_time, parser_end: Time.now.utc} + end + end + end +end diff --git a/lib/openc_bot/helpers/pseudo_machine_transformer.rb b/lib/openc_bot/helpers/pseudo_machine_transformer.rb new file mode 100644 index 0000000..64e4716 --- /dev/null +++ b/lib/openc_bot/helpers/pseudo_machine_transformer.rb @@ -0,0 +1,41 @@ +require "openc_bot/helpers/persistence_handler" + +module OpencBot + module Helpers + module PseudoMachineTransformer + include OpencBot::Helpers::PersistenceHandler + include OpencBot::Helpers::RegisterMethods + + def input_stream + "parser" + end + + def encapsulate_as_per_schema(parsed_datum) + # define in bot + # note: should explicitly include the jurisdiction code as this can't + # be inferred from the transformer name + end + + def run + counter = 0 + start_time = Time.now.utc + get_input_data do |json_data| + entity_datum = encapsulate_as_per_schema(json_data) + validation_errors = validate_datum(entity_datum) + persist(entity_datum) + save_entity!(entity_datum) if development? || ENV["SAVE_DATA_IN_SQLITE"] + return unless validation_errors.blank? + counter += 1 + end + { transformed: counter, + transformer_start: start_time, + transformer_end: Time.now.utc + } + end + + def schema_name + super || "company-schema" + end + end + end +end diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb new file mode 100644 index 0000000..2eeef22 --- /dev/null +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +require "openc_bot" +require "openc_bot/company_fetcher_bot" + +module OpencBot + module PseudoMachineCompanyFetcherBot + include OpencBot::CompanyFetcherBot + include OpencBot::PersistenceHandler + + # Outline bot run logic. + def update_data(options = {}) + res = {} + res.merge!(Fetcher.run) + res.merge!(Parser.run) + transformed_result_data = Transformer.run + res.merge!(transformed_result_data) + # rename directory so it will be seen by importer + mark_acquisition_directory_as_finished_processing + res[:data_directory] = acquisition_directory_final + res + rescue Exception => e + send_error_report(e, options) + raise e + end + + end +end diff --git a/spec/lib/openc_bot/helpers/persistence_handler_spec.rb b/spec/lib/openc_bot/helpers/persistence_handler_spec.rb new file mode 100644 index 0000000..621db87 --- /dev/null +++ b/spec/lib/openc_bot/helpers/persistence_handler_spec.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/persistence_handler" +# require "openc_bot/helpers/incremental_search" + +module ModuleThatIncludesPersistenceHandlerFoo + extend OpencBot + extend OpencBot::Helpers::PersistenceHandler +end + +describe "a module that includes PersistenceHandler" do + before do + end + + after do + # remove_test_database + end + + describe "output_stream" do + it "should return last word of module name" do + expect(ModuleThatIncludesPersistenceHandlerFoo.output_stream).to eq("foo") + end + end + +end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb new file mode 100644 index 0000000..6f5d5e3 --- /dev/null +++ b/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/pseudo_machine_fetcher" +# require "openc_bot/helpers/incremental_search" + +module ModuleThatIncludesPseudoMachineFetcher + extend OpencBot + extend OpencBot::Helpers::PseudoMachineFetcher +end + +describe "a module that includes PseudoMachineFetcher" do + before do + # allow(ModuleThatIncludesIncrementalSearch).to receive(:sqlite_magic_connection).and_return(test_database_connection) + end + + after do + # remove_test_database + end + + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:run) + end + + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:persist) + end + + it "includes register_methods" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:fetch_data) + end + +end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb new file mode 100644 index 0000000..d6874af --- /dev/null +++ b/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/pseudo_machine_parser" +# require "openc_bot/helpers/incremental_search" + +module ModuleThatIncludesPseudoMachineParser + extend OpencBot + extend OpencBot::Helpers::PseudoMachineParser +end + +describe "a module that includes PseudoMachineParser" do + before do + # allow(ModuleThatIncludesIncrementalSearch).to receive(:sqlite_magic_connection).and_return(test_database_connection) + end + + after do + # remove_test_database + end + + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineParser).to respond_to(:run) + end + + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineParser).to respond_to(:persist) + end + + describe "input_stream" do + it "should return 'fetcher'" do + expect(ModuleThatIncludesPseudoMachineParser.input_stream).to eq("fetcher") + end + end + +end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb new file mode 100644 index 0000000..f63b081 --- /dev/null +++ b/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb @@ -0,0 +1,39 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/pseudo_machine_transformer" +# require "openc_bot/helpers/incremental_search" + +module ModuleThatIncludesPseudoMachineTransformer + extend OpencBot + extend OpencBot::Helpers::PseudoMachineTransformer +end + +describe "a module that includes PseudoMachineTransformer" do + before do + end + + after do + # remove_test_database + end + + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:run) + end + + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:persist) + end + + it "includes RegisterMethods methods" do + expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:validate_datum) + end + + describe "input_stream" do + it "should return 'parser'" do + expect(ModuleThatIncludesPseudoMachineTransformer.input_stream).to eq("parser") + end + end + +end diff --git a/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb new file mode 100644 index 0000000..b14350e --- /dev/null +++ b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +require_relative "../spec_helper" +require "openc_bot" +require "openc_bot/pseudo_machine_company_fetcher_bot" + +Mail.defaults do + delivery_method :test # no, don't send emails when testing +end + +module TestPseudoMachineCompaniesFetcher + extend OpencBot::PseudoMachineCompanyFetcherBot +end + +describe "A module that extends PseudoMachineCompanyFetcherBot" do + before do + end + + it "includes CompanyFetcherBot methods" do + expect(TestPseudoMachineCompaniesFetcher).to respond_to(:inferred_jurisdiction_code) + end + +end From 5a8315cc9843e83433b6d11d21551a6ca96316b6 Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Tue, 23 Mar 2021 15:27:08 +0530 Subject: [PATCH 02/30] flow changes; added fetch; parse; transform tasks --- Gemfile.lock | 47 +++++++------- lib/openc_bot/company_fetcher_bot.rb | 1 + lib/openc_bot/helpers/persistence_handler.rb | 47 +++++++++----- .../helpers/pseudo_machine_fetcher.rb | 9 +-- .../helpers/pseudo_machine_parser.rb | 7 ++- .../pseudo_machine_register_methods.rb | 20 ++++++ .../helpers/pseudo_machine_transformer.rb | 27 ++++---- lib/openc_bot/helpers/register_methods.rb | 4 ++ lib/openc_bot/helpers/reporting.rb | 2 +- .../pseudo_machine_company_fetcher_bot.rb | 63 ++++++++++++++++--- lib/openc_bot/tasks.rb | 31 ++++++++- .../helpers/persistence_handler_spec.rb | 15 +---- .../helpers/pseudo_machine_fetcher_spec.rb | 31 ++++----- .../helpers/pseudo_machine_parser_spec.rb | 18 ++---- .../pseudo_machine_transformer_spec.rb | 17 ++--- ...pseudo_machine_company_fetcher_bot_spec.rb | 12 ++-- spec/spec_helper.rb | 1 + 17 files changed, 221 insertions(+), 131 deletions(-) create mode 100644 lib/openc_bot/helpers/pseudo_machine_register_methods.rb diff --git a/Gemfile.lock b/Gemfile.lock index e9c0866..ca073a1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -34,48 +34,51 @@ GEM tzinfo (~> 1.1) addressable (2.7.0) public_suffix (>= 2.0.2, < 5.0) - ast (2.4.1) - backports (3.18.2) + ast (2.4.2) + backports (3.20.2) byebug (10.0.2) coderay (1.1.3) - concurrent-ruby (1.1.7) - crack (0.4.4) + concurrent-ruby (1.1.8) + crack (0.4.5) + rexml diff-lcs (1.4.4) hashdiff (1.0.1) httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.3.1) + json (2.5.1) json-schema (2.8.1) addressable (>= 2.4) mail (2.7.1) mini_mime (>= 0.1.1) method_source (1.0.0) mini_mime (1.0.2) - mini_portile2 (2.4.0) - minitest (5.14.2) + mini_portile2 (2.5.0) + minitest (5.14.4) mono_logger (1.1.0) multi_json (1.15.0) mustermann (1.1.1) ruby2_keywords (~> 0.0.1) - nokogiri (1.10.10) - mini_portile2 (~> 2.4.0) + nokogiri (1.11.2) + mini_portile2 (~> 2.5.0) + racc (~> 1.4) parallel (1.20.1) - parser (2.7.2.0) + parser (3.0.0.0) ast (~> 2.4.1) - pry (0.13.1) + pry (0.14.0) coderay (~> 1.1) method_source (~> 1.0) public_suffix (4.0.6) + racc (1.5.2) rack (2.2.3) rack-protection (2.1.0) rack rainbow (3.0.0) rake (12.3.3) redis (4.2.5) - redis-namespace (1.8.0) + redis-namespace (1.8.1) redis (>= 3.0.4) - regexp_parser (2.0.0) + regexp_parser (2.1.1) resque (2.0.0) mono_logger (~> 1.0) multi_json (~> 1.0) @@ -88,15 +91,15 @@ GEM rspec-core (~> 3.10.0) rspec-expectations (~> 3.10.0) rspec-mocks (~> 3.10.0) - rspec-core (3.10.0) + rspec-core (3.10.1) rspec-support (~> 3.10.0) - rspec-expectations (3.10.0) + rspec-expectations (3.10.1) diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.10.0) - rspec-mocks (3.10.0) + rspec-mocks (3.10.2) diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.10.0) - rspec-support (3.10.0) + rspec-support (3.10.2) rubocop (0.93.1) parallel (~> 1.10) parser (>= 2.7.1.5) @@ -106,13 +109,13 @@ GEM rubocop-ast (>= 0.6.0) ruby-progressbar (~> 1.7) unicode-display_width (>= 1.4.0, < 2.0) - rubocop-ast (1.3.0) + rubocop-ast (1.4.1) parser (>= 2.7.1.5) rubocop-rspec (1.44.1) rubocop (~> 0.87) rubocop-ast (>= 0.7.1) - ruby-progressbar (1.10.1) - ruby2_keywords (0.0.2) + ruby-progressbar (1.11.0) + ruby2_keywords (0.0.4) scraperwiki (3.0.2) httpclient sqlite_magic @@ -125,7 +128,7 @@ GEM statsd-instrument (1.7.2) thread_safe (0.3.6) tilt (2.0.10) - tzinfo (1.2.8) + tzinfo (1.2.9) thread_safe (~> 0.1) unicode-display_width (1.7.0) vegas (0.1.11) @@ -149,4 +152,4 @@ DEPENDENCIES webmock (~> 1.20) BUNDLED WITH - 1.17.3 + 2.1.4 diff --git a/lib/openc_bot/company_fetcher_bot.rb b/lib/openc_bot/company_fetcher_bot.rb index 108b874..676e2e3 100644 --- a/lib/openc_bot/company_fetcher_bot.rb +++ b/lib/openc_bot/company_fetcher_bot.rb @@ -56,6 +56,7 @@ def run(options = {}) # we may get a string back, or something else update_data_results = { output: update_data_results.to_s } unless update_data_results.is_a?(Hash) report_run_results(update_data_results.merge(started_at: start_time, ended_at: Time.now, status_code: "1")) + update_data_results end def schema_name diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 2864add..09a9ff7 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -2,35 +2,31 @@ module OpencBot module Helpers - + # Persistence handler for bot activities module PersistenceHandler - def input_stream # override in segment bots end def output_stream - self.name.to_s[/[A-Z][a-z]+$/].downcase + name.to_s[/[A-Z][a-z]+$/].downcase end def acquisition_base_directory - ENV['ACQUISITION_BASE_DIRECTORY'] || "data" + ENV["ACQUISITION_BASE_DIRECTORY"] || "data" end def acquisition_id - @@acquisition_id ||= ( - ENV["ACQUISITION_ID"] || - (in_progress_acquisition_id && !ENV["FORCE_NEW_ACQUISITION"]) || - Time.now.to_i - ) + @acquisition_id ||= ENV["FORCE_NEW_ACQUISITION"].blank? ? ENV["ACQUISITION_ID"] || in_progress_acquisition_id || Time.now.to_i.to_s : Time.now.to_i.to_s end # gets the most recent in progress acquisition id, based on in processing # directories def in_progress_acquisition_id + return @acquisition_id unless @acquisition_id.blank? + in_progress_acquisitions = Dir.glob("#{acquisition_base_directory}/*_processing").sort - return if in_progress_acquisitions.empty? - in_progress_acquisitions.last.filename.sub("_processing","") + in_progress_acquisitions.blank? ? nil : in_progress_acquisitions.last.split("/").last.sub("_processing", "") end def input_file_location @@ -42,23 +38,33 @@ def output_file_location end def acquisition_directory_processing - processing_directory = ENV['ACQUISITION_DIRECTORY'] || File.join(acquisition_base_directory, "#{acquisition_id}_processing") - File.mkdir(processing_directory) unless Dir.exist?(processing_directory) + processing_directory = ENV["ACQUISITION_DIRECTORY"] || File.join(acquisition_base_directory, "#{acquisition_id}_processing") + unless Dir.exist?(processing_directory) + if ENV["ACQUISITION_ID"] + mark_finished_acquisition_directory_as_processing(processing_directory) + else + FileUtils.mkdir(processing_directory) + end + end + @acquisition_directory ||= processing_directory processing_directory end def acquisition_directory_final - File.join(acquisition_base_directory, in_progress_acquisition_id) + @acquisition_directory = File.join(acquisition_base_directory, in_progress_acquisition_id) end def records_processed - `wc -l "#{output_file_location}"`.strip.split(' ')[0].to_i + `wc -l "#{output_file_location}"`.strip.split[0].to_i end - def get_input_data + def input_data File.foreach(input_file_location) do |line| yield JSON.parse(line) end + rescue Errno::ENOENT + warn "No such file: #{input_file_location} present" + [] end def persist(res) @@ -67,10 +73,19 @@ def persist(res) end end + def acquisition_directory + @acquisition_directory || acquisition_directory_processing + end + private + def mark_acquisition_directory_as_finished_processing File.rename(acquisition_directory_processing, acquisition_directory_final) end + + def mark_finished_acquisition_directory_as_processing(processing_directory) + File.rename(acquisition_directory_final, processing_directory) + end end end end diff --git a/lib/openc_bot/helpers/pseudo_machine_fetcher.rb b/lib/openc_bot/helpers/pseudo_machine_fetcher.rb index 35d08f3..bf79a7d 100644 --- a/lib/openc_bot/helpers/pseudo_machine_fetcher.rb +++ b/lib/openc_bot/helpers/pseudo_machine_fetcher.rb @@ -1,10 +1,14 @@ +# frozen_string_literal: true + require "openc_bot/helpers/persistence_handler" +require "openc_bot/helpers/pseudo_machine_register_methods" module OpencBot module Helpers + # Fetching activities module PseudoMachineFetcher + include OpencBot::Helpers::PsuedoMachineRegisterMethods include OpencBot::Helpers::PersistenceHandler - include OpencBot::Helpers::RegisterMethods def dataset_based const_defined?("DATASET_BASED") && const_get("DATASET_BASED") @@ -17,9 +21,6 @@ def run res = {} res.merge!(fetch_data_results) if fetch_data_results.is_a?(Hash) res - rescue Exception => e - send_error_report(e, options) - raise e end def fetch_data_via_dataset diff --git a/lib/openc_bot/helpers/pseudo_machine_parser.rb b/lib/openc_bot/helpers/pseudo_machine_parser.rb index c75fb2d..65e01bd 100644 --- a/lib/openc_bot/helpers/pseudo_machine_parser.rb +++ b/lib/openc_bot/helpers/pseudo_machine_parser.rb @@ -4,6 +4,7 @@ module OpencBot module Helpers + # Parsing activities module PseudoMachineParser include OpencBot::Helpers::PersistenceHandler @@ -20,15 +21,17 @@ def parse(fetched_datum) def run start_time = Time.now.utc counter = 0 - get_input_data do |fetched_datum| + input_data do |fetched_datum| parsed_data = parse(fetched_datum) parsed_data = [parsed_data] unless parsed_data.is_a?(Array) parsed_data.each do |parsed_datum| + next if parsed_datum.blank? + persist(parsed_datum) counter += 1 end end - {parsed: counter, parser_start: start_time, parser_end: Time.now.utc} + { parsed: counter, parser_start: start_time, parser_end: Time.now.utc } end end end diff --git a/lib/openc_bot/helpers/pseudo_machine_register_methods.rb b/lib/openc_bot/helpers/pseudo_machine_register_methods.rb new file mode 100644 index 0000000..0d203f4 --- /dev/null +++ b/lib/openc_bot/helpers/pseudo_machine_register_methods.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require "openc_bot/helpers/register_methods" + +module OpencBot + module Helpers + # Configuration/Methods overrides for Pseduo Machine related activities + module PsuedoMachineRegisterMethods + include OpencBot::Helpers::RegisterMethods + + def primary_key_name + const_defined?("PRIMARY_KEY_NAME") ? const_get("PRIMARY_KEY_NAME") : :company_number + end + + def exception_to_object(exp) + { klass: exp.class.to_s, message: exp.message, backtrace: exp.backtrace } + end + end + end +end diff --git a/lib/openc_bot/helpers/pseudo_machine_transformer.rb b/lib/openc_bot/helpers/pseudo_machine_transformer.rb index 64e4716..ef1839c 100644 --- a/lib/openc_bot/helpers/pseudo_machine_transformer.rb +++ b/lib/openc_bot/helpers/pseudo_machine_transformer.rb @@ -1,10 +1,15 @@ +# frozen_string_literal: true + require "openc_bot/helpers/persistence_handler" +require "openc_bot/helpers/pseudo_machine_register_methods" module OpencBot module Helpers + # Transformer activities module PseudoMachineTransformer + include OpencBot include OpencBot::Helpers::PersistenceHandler - include OpencBot::Helpers::RegisterMethods + include OpencBot::Helpers::PsuedoMachineRegisterMethods def input_stream "parser" @@ -19,18 +24,18 @@ def encapsulate_as_per_schema(parsed_datum) def run counter = 0 start_time = Time.now.utc - get_input_data do |json_data| + input_data do |json_data| entity_datum = encapsulate_as_per_schema(json_data) - validation_errors = validate_datum(entity_datum) - persist(entity_datum) - save_entity!(entity_datum) if development? || ENV["SAVE_DATA_IN_SQLITE"] - return unless validation_errors.blank? - counter += 1 + unless entity_datum.blank? + validation_errors = validate_datum(entity_datum) + raise "\n#{JSON.pretty_generate([entity_datum, validation_errors])}" unless validation_errors.blank? + + persist(entity_datum) + save_entity(entity_datum) unless ENV["NO_SAVE_DATA_IN_SQLITE"] + counter += 1 + end end - { transformed: counter, - transformer_start: start_time, - transformer_end: Time.now.utc - } + { transformed: counter, transformer_start: start_time, transformer_end: Time.now.utc } end def schema_name diff --git a/lib/openc_bot/helpers/register_methods.rb b/lib/openc_bot/helpers/register_methods.rb index 227cb92..56daab7 100644 --- a/lib/openc_bot/helpers/register_methods.rb +++ b/lib/openc_bot/helpers/register_methods.rb @@ -16,6 +16,10 @@ module RegisterMethods MAX_BUSY_RETRIES = 3 + def exception_to_object(exp) + { klass: exp.class.to_s, message: exp.message, backtrace: exp.backtrace } + end + def allowed_hours if const_defined?("ALLOWED_HOURS") const_get("ALLOWED_HOURS").to_a diff --git a/lib/openc_bot/helpers/reporting.rb b/lib/openc_bot/helpers/reporting.rb index 9952907..407487f 100644 --- a/lib/openc_bot/helpers/reporting.rb +++ b/lib/openc_bot/helpers/reporting.rb @@ -41,7 +41,7 @@ def send_error_report(exception, options = {}) def send_run_report(run_results = nil) subject = "#{name} successfully ran" db_filesize = File.size?(db_location) - body = "No problems to report. db is #{db_location}, #{db_filesize} bytes. Last modified: #{File.stat(db_location).mtime}" + body = "No problems to report. db is #{db_location}, #{db_filesize} bytes. Last modified: #{File.exist?(db_location) ? File.stat(db_location).mtime : ''}" body += "\nRun results = #{run_results.inspect}" unless run_results.blank? send_report(subject: subject, body: body) end diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index 2eeef22..aa2e1d6 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -2,27 +2,74 @@ require "openc_bot" require "openc_bot/company_fetcher_bot" +require "openc_bot/helpers/persistence_handler" module OpencBot + # Psuedo machine fetcher bot top level class to orchestrate activities module PseudoMachineCompanyFetcherBot include OpencBot::CompanyFetcherBot - include OpencBot::PersistenceHandler + include OpencBot::Helpers::PersistenceHandler + + def bot_name + @bot_name ||= Dir.pwd.split("/").last + end + + def callable_from_file_name(underscore_file_name) + bot_klass = klass_from_file_name(underscore_file_name) + if bot_klass.respond_to?(:new) + bot_klass.new + else + bot_klass + end + end + + def klass_from_file_name(underscore_file_name) + camelcase_version = underscore_file_name.split("_").map(&:capitalize).join + Object.const_get(camelcase_version) + end + + def db_name + "#{bot_name.downcase}.db" + end + + def processing_states + return @processing_states unless @processing_states.nil? + + state_file = "#{acquisition_directory}/processing_states.json" + @processing_states = if File.exist?(state_file) + JSON.parse(IO.read(state_file)) + else + [] + end + end # Outline bot run logic. def update_data(options = {}) res = {} - res.merge!(Fetcher.run) - res.merge!(Parser.run) - transformed_result_data = Transformer.run - res.merge!(transformed_result_data) + bot_namespace = callable_from_file_name(bot_name) + unless processing_states.include?("fetcher") + res.merge!(bot_namespace::Fetcher.run) + processing_states << "fetcher" + end + unless processing_states.include?("parser") + res.merge!(bot_namespace::Parser.run) + processing_states << "parser" + end + unless @processing_states.include?("transformer") + res.merge!(bot_namespace::Transformer.run) + processing_states << "transformer" + end + res[:data_directory] = acquisition_directory_final # rename directory so it will be seen by importer mark_acquisition_directory_as_finished_processing - res[:data_directory] = acquisition_directory_final + raise "\n#{JSON.pretty_generate(res)}" if res.key?(:fetch_data_error) || res.key?(:update_stale_error) + res - rescue Exception => e + rescue StandardError => e send_error_report(e, options) raise e + ensure + IO.write("#{acquisition_directory}/processing_states.json", processing_states.to_json) end - end end diff --git a/lib/openc_bot/tasks.rb b/lib/openc_bot/tasks.rb index 337a753..3802d12 100644 --- a/lib/openc_bot/tasks.rb +++ b/lib/openc_bot/tasks.rb @@ -46,6 +46,33 @@ def pid_dir create_bot("company") end + desc "Run the fetch task which does all the data fetching from the fetcher" + task :fetch do |_t, _args| + bot_name = get_bot_name + only_process_running("#{bot_name}-bot:fetch") do + require_relative File.join(Dir.pwd, "lib", bot_name) + callable_from_file_name(bot_name)::Fetcher.run + end + end + + desc "Parses the entries from fetch output of specific acqusition" + task :parse do |t, args| + bot_name = get_bot_name + only_process_running("#{bot_name}-#{t.name}-#{args[:acquisition_id]}") do + require_relative File.join(Dir.pwd, "lib", bot_name) + callable_from_file_name(bot_name)::Parser.run + end + end + + desc "Transforms the entries from parse output of specific acqusition" + task :transform do |t, args| + bot_name = get_bot_name + only_process_running("#{bot_name}-#{t.name}-#{args[:acquisition_id]}") do + require_relative File.join(Dir.pwd, "lib", bot_name) + callable_from_file_name(bot_name)::Transformer.run + end + end + desc "Perform a fetcher bot update_data run without reporting and with dev/debug options" task :run do |t, args| bot_name = get_bot_name @@ -80,9 +107,9 @@ def pid_dir require_relative File.join(Dir.pwd, "lib", bot_name) runner = callable_from_file_name(bot_name) - count = runner.run(options) + res = runner.run(options) - puts "Got #{count} records" + puts res.to_json end rescue Exception => e raise e unless e.message[/already running/i] diff --git a/spec/lib/openc_bot/helpers/persistence_handler_spec.rb b/spec/lib/openc_bot/helpers/persistence_handler_spec.rb index 621db87..c59f165 100644 --- a/spec/lib/openc_bot/helpers/persistence_handler_spec.rb +++ b/spec/lib/openc_bot/helpers/persistence_handler_spec.rb @@ -3,25 +3,16 @@ require "spec_helper" require "openc_bot" require "openc_bot/helpers/persistence_handler" -# require "openc_bot/helpers/incremental_search" module ModuleThatIncludesPersistenceHandlerFoo extend OpencBot extend OpencBot::Helpers::PersistenceHandler end -describe "a module that includes PersistenceHandler" do - before do - end - - after do - # remove_test_database - end - - describe "output_stream" do - it "should return last word of module name" do +describe OpencBot::Helpers::PersistenceHandler do + context "when a module that includes PersistenceHandler" do + it "return's last word of module name" do expect(ModuleThatIncludesPersistenceHandlerFoo.output_stream).to eq("foo") end end - end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb index 6f5d5e3..bfb5c4e 100644 --- a/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb +++ b/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb @@ -3,32 +3,25 @@ require "spec_helper" require "openc_bot" require "openc_bot/helpers/pseudo_machine_fetcher" -# require "openc_bot/helpers/incremental_search" +require "openc_bot/helpers/persistence_handler" module ModuleThatIncludesPseudoMachineFetcher extend OpencBot extend OpencBot::Helpers::PseudoMachineFetcher end -describe "a module that includes PseudoMachineFetcher" do - before do - # allow(ModuleThatIncludesIncrementalSearch).to receive(:sqlite_magic_connection).and_return(test_database_connection) - end - - after do - # remove_test_database - end +describe OpencBot::Helpers::PseudoMachineFetcher do + context "when a module that includes PseudoMachineFetcher" do + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:run) + end - it "has #run method" do - expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:run) - end + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:persist) + end - it "includes PersistenceHandler methods" do - expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:persist) + it "includes register_methods" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:fetch_data) + end end - - it "includes register_methods" do - expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:fetch_data) - end - end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb index d6874af..dc7c477 100644 --- a/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb +++ b/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb @@ -10,15 +10,8 @@ module ModuleThatIncludesPseudoMachineParser extend OpencBot::Helpers::PseudoMachineParser end -describe "a module that includes PseudoMachineParser" do - before do - # allow(ModuleThatIncludesIncrementalSearch).to receive(:sqlite_magic_connection).and_return(test_database_connection) - end - - after do - # remove_test_database - end - +describe OpencBot::Helpers::PseudoMachineParser +context "when a module that includes PseudoMachineParser" do it "has #run method" do expect(ModuleThatIncludesPseudoMachineParser).to respond_to(:run) end @@ -27,10 +20,7 @@ module ModuleThatIncludesPseudoMachineParser expect(ModuleThatIncludesPseudoMachineParser).to respond_to(:persist) end - describe "input_stream" do - it "should return 'fetcher'" do - expect(ModuleThatIncludesPseudoMachineParser.input_stream).to eq("fetcher") - end + it "return's fetcher" do + expect(ModuleThatIncludesPseudoMachineParser.input_stream).to eq("fetcher") end - end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb index f63b081..bf2b8c3 100644 --- a/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb +++ b/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb @@ -10,14 +10,8 @@ module ModuleThatIncludesPseudoMachineTransformer extend OpencBot::Helpers::PseudoMachineTransformer end -describe "a module that includes PseudoMachineTransformer" do - before do - end - - after do - # remove_test_database - end - +describe OpencBot::Helpers::PseudoMachineTransformer +context "when a module that includes PseudoMachineTransformer" do it "has #run method" do expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:run) end @@ -30,10 +24,7 @@ module ModuleThatIncludesPseudoMachineTransformer expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:validate_datum) end - describe "input_stream" do - it "should return 'parser'" do - expect(ModuleThatIncludesPseudoMachineTransformer.input_stream).to eq("parser") - end + it "return's parser" do + expect(ModuleThatIncludesPseudoMachineTransformer.input_stream).to eq("parser") end - end diff --git a/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb index b14350e..65ebc5f 100644 --- a/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb +++ b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb @@ -12,12 +12,10 @@ module TestPseudoMachineCompaniesFetcher extend OpencBot::PseudoMachineCompanyFetcherBot end -describe "A module that extends PseudoMachineCompanyFetcherBot" do - before do +describe OpencBot::PseudoMachineCompanyFetcherBot do + context "when a module extends PseudoMachineCompanyFetcherBot" do + it "includes CompanyFetcherBot methods" do + expect(TestPseudoMachineCompaniesFetcher).to respond_to(:inferred_jurisdiction_code) + end end - - it "includes CompanyFetcherBot methods" do - expect(TestPseudoMachineCompaniesFetcher).to respond_to(:inferred_jurisdiction_code) - end - end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 2e346a2..8cfef50 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -8,6 +8,7 @@ WebMock.disable_net_connect!(allow_localhost: true) RSpec.configure do |config| + # noop end def remove_test_database From 557d611d09d5b35e3f71a3c44fb6f72bcfbecce2 Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Tue, 23 Mar 2021 16:00:46 +0530 Subject: [PATCH 03/30] namespace specs --- spec/lib/pseudo_machine_company_fetcher_bot_spec.rb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb index 65ebc5f..894675f 100644 --- a/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb +++ b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb @@ -12,10 +12,12 @@ module TestPseudoMachineCompaniesFetcher extend OpencBot::PseudoMachineCompanyFetcherBot end -describe OpencBot::PseudoMachineCompanyFetcherBot do - context "when a module extends PseudoMachineCompanyFetcherBot" do - it "includes CompanyFetcherBot methods" do - expect(TestPseudoMachineCompaniesFetcher).to respond_to(:inferred_jurisdiction_code) +module OpencBot + describe PseudoMachineCompanyFetcherBot do + context "when a module extends PseudoMachineCompanyFetcherBot" do + it "includes CompanyFetcherBot methods" do + expect(TestPseudoMachineCompaniesFetcher).to respond_to(:inferred_jurisdiction_code) + end end end end From 5b0e5836f6aaa5ecf364c7de5a2164d2d6e89d25 Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Tue, 23 Mar 2021 22:22:34 +0530 Subject: [PATCH 04/30] bind to specific ruby version --- .ruby-version | 1 + Gemfile.lock | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 .ruby-version diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..ec1cf33 --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +2.6.3 diff --git a/Gemfile.lock b/Gemfile.lock index ca073a1..d20ad01 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -152,4 +152,4 @@ DEPENDENCIES webmock (~> 1.20) BUNDLED WITH - 2.1.4 + 1.17.3 From 17e7910baa6bfc8a8f384f7bda24c8ac2fc5dcfb Mon Sep 17 00:00:00 2001 From: CountCulture Date: Wed, 31 Mar 2021 15:45:27 +0100 Subject: [PATCH 05/30] Update openc-schema to point to branch that tightens criteria --- .gitmodules | 1 + schemas | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index fab109b..9f5657f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "schemas"] path = schemas url = git://github.com/openc/openc-schema.git + branch = tighten-additional-properties diff --git a/schemas b/schemas index 051a03a..0a2f8ac 160000 --- a/schemas +++ b/schemas @@ -1 +1 @@ -Subproject commit 051a03a84b0a05d175e11737dc2a9ded0d9db647 +Subproject commit 0a2f8ac89be64caeb46c008aedf75f3e65015d84 From a7199a346f96ddfd9dd70b6c2357239f5e29762a Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Wed, 7 Apr 2021 09:20:11 +0530 Subject: [PATCH 06/30] fail fast if individual tasks in chain are called --- lib/openc_bot/helpers/pseudo_machine_fetcher.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/openc_bot/helpers/pseudo_machine_fetcher.rb b/lib/openc_bot/helpers/pseudo_machine_fetcher.rb index bf79a7d..9c5a297 100644 --- a/lib/openc_bot/helpers/pseudo_machine_fetcher.rb +++ b/lib/openc_bot/helpers/pseudo_machine_fetcher.rb @@ -44,8 +44,12 @@ def fetch_data end res.merge(fetched: records_processed, fetch_start: start_time, fetch_end: Time.now.utc) rescue OpencBot::OutOfPermittedHours, OpencBot::SourceClosedForMaintenance, Interrupt, SystemExit => e + raise e unless caller.to_s[/update_data/] + res.merge!({ fetch_data_output: { error: exception_to_object(e) } }) rescue StandardError => e + raise e unless caller.to_s[/update_data/] + res.merge!({ fetch_data_error: { error: exception_to_object(e) } }) end end From c27f012f525bf06b431e39aa7de4212c6026bf66 Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Wed, 7 Apr 2021 09:22:31 +0530 Subject: [PATCH 07/30] save_entity doesn't validate in psuedo machine as validation is default --- lib/openc_bot/helpers/pseudo_machine_register_methods.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/openc_bot/helpers/pseudo_machine_register_methods.rb b/lib/openc_bot/helpers/pseudo_machine_register_methods.rb index 0d203f4..4fd3efa 100644 --- a/lib/openc_bot/helpers/pseudo_machine_register_methods.rb +++ b/lib/openc_bot/helpers/pseudo_machine_register_methods.rb @@ -8,6 +8,10 @@ module Helpers module PsuedoMachineRegisterMethods include OpencBot::Helpers::RegisterMethods + def save_entity(entity_datum) + prepare_and_save_data(entity_datum) + end + def primary_key_name const_defined?("PRIMARY_KEY_NAME") ? const_get("PRIMARY_KEY_NAME") : :company_number end From 93b575e76743fe50c69370acdd6d29a6a14861e6 Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Wed, 7 Apr 2021 09:45:29 +0530 Subject: [PATCH 08/30] db name should not include transform in it --- lib/openc_bot/helpers/pseudo_machine_transformer.rb | 2 ++ lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/openc_bot/helpers/pseudo_machine_transformer.rb b/lib/openc_bot/helpers/pseudo_machine_transformer.rb index ef1839c..9a917b8 100644 --- a/lib/openc_bot/helpers/pseudo_machine_transformer.rb +++ b/lib/openc_bot/helpers/pseudo_machine_transformer.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require "openc_bot/helpers/persistence_handler" +require "openc_bot/pseudo_machine_company_fetcher_bot" require "openc_bot/helpers/pseudo_machine_register_methods" module OpencBot @@ -8,6 +9,7 @@ module Helpers # Transformer activities module PseudoMachineTransformer include OpencBot + include OpencBot::PseudoMachineCompanyFetcherBot include OpencBot::Helpers::PersistenceHandler include OpencBot::Helpers::PsuedoMachineRegisterMethods diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index aa2e1d6..70db8d2 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -29,7 +29,7 @@ def klass_from_file_name(underscore_file_name) end def db_name - "#{bot_name.downcase}.db" + "#{bot_name.gsub('_', '').downcase}.db" end def processing_states From 6d4acbcb30f6246edc0570e7055fe6c65734aafb Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Wed, 7 Apr 2021 11:48:28 +0530 Subject: [PATCH 09/30] remove the todo rubocop file --- .rubocop_todo.yml | 302 ---------------------------------------------- 1 file changed, 302 deletions(-) delete mode 100644 .rubocop_todo.yml diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml deleted file mode 100644 index 5857242..0000000 --- a/.rubocop_todo.yml +++ /dev/null @@ -1,302 +0,0 @@ -# This configuration was generated by -# `rubocop --auto-gen-config --exclude-limit 999 --no-offense-counts --no-auto-gen-timestamp` -# using RuboCop version 0.93.1. -# The point is for the user to remove these configuration records -# one by one as the offenses are removed from the code base. -# Note that changes in the inspected code, or installation of new -# versions of RuboCop, may require this file to be generated again. - -# Configuration parameters: Include. -# Include: **/*.gemspec -Gemspec/RequiredRubyVersion: - Exclude: - - 'openc_bot.gemspec' - -# Cop supports --auto-correct. -# Configuration parameters: EnforcedStyleAlignWith, Severity. -# SupportedStylesAlignWith: start_of_line, begin -Layout/BeginEndAlignment: - Exclude: - - 'examples/company_fetchers/basic.rb' - -# Cop supports --auto-correct. -Layout/RescueEnsureAlignment: - Exclude: - - 'examples/company_fetchers/basic.rb' - -# Configuration parameters: AllowSafeAssignment. -Lint/AssignmentInCondition: - Exclude: - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - -Lint/RescueException: - Exclude: - - 'lib/openc_bot/bot_data_validator.rb' - - 'lib/openc_bot/company_fetcher_bot.rb' - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - - 'lib/openc_bot/jobs/sru_request_job.rb' - - 'lib/openc_bot/tasks.rb' - -Lint/ShadowedException: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - -# Configuration parameters: AllowComments. -Lint/SuppressedException: - Exclude: - - 'lib/openc_bot/incrementers/base.rb' - -Lint/UselessAssignment: - Exclude: - - 'lib/openc_bot/bot_data_validator.rb' - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/tasks.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: IgnoredMethods. -Metrics/AbcSize: - Max: 33 - -# Configuration parameters: IgnoredMethods. -Metrics/CyclomaticComplexity: - Max: 13 - -# Configuration parameters: CountComments, CountAsOne, ExcludedMethods. -Metrics/MethodLength: - Max: 27 - -# Configuration parameters: CountKeywordArgs. -Metrics/ParameterLists: - Max: 6 - -# Configuration parameters: IgnoredMethods. -Metrics/PerceivedComplexity: - Max: 14 - -Naming/AccessorMethodName: - Exclude: - - 'lib/openc_bot/tasks.rb' - -# Configuration parameters: ExpectMatchingDefinition, CheckDefinitionPathHierarchy, Regex, IgnoreExecutableScripts, AllowedAcronyms. -# AllowedAcronyms: CLI, DSL, ACL, API, ASCII, CPU, CSS, DNS, EOF, GUID, HTML, HTTP, HTTPS, ID, IP, JSON, LHS, QPS, RAM, RHS, RPC, SLA, SMTP, SQL, SSH, TCP, TLS, TTL, UDP, UI, UID, UUID, URI, URL, UTF8, VM, XML, XMPP, XSRF, XSS -Naming/FileName: - Exclude: - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: EnforcedStyle. -# SupportedStyles: snake_case, normalcase, non_integer -Naming/VariableNumber: - Exclude: - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: Prefixes. -# Prefixes: when, with, without -RSpec/ContextWording: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: IgnoredMetadata. -RSpec/DescribeClass: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/exceptions_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: Max. -RSpec/ExampleLength: - Exclude: - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/dates_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot/helpers/reporting_spec.rb' - - 'spec/lib/openc_bot/helpers/text_spec.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -RSpec/ExpectInHook: - Exclude: - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - -# Configuration parameters: CustomTransform, IgnoreMethods, SpecSuffixOnly. -RSpec/FilePath: - Exclude: - - 'lib/openc_bot/templates/spec/bot_spec.rb' - - 'spec/lib/bot_data_validator_spec.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - -# Configuration parameters: AssignmentOnly. -RSpec/InstanceVariable: - Exclude: - - 'spec/lib/bot_data_validator_spec.rb' - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/exceptions_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: EnforcedStyle. -# SupportedStyles: have_received, receive -RSpec/MessageSpies: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot/helpers/reporting_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - -RSpec/MultipleExpectations: - Max: 48 - -RSpec/NestedGroups: - Max: 5 - -RSpec/StubbedMock: - Exclude: - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - -# Configuration parameters: IgnoreNameless, IgnoreSymbolicNames. -RSpec/VerifiedDoubles: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - -Security/Open: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - -# Configuration parameters: AllowedChars. -Style/AsciiComments: - Exclude: - - 'lib/openc_bot/templates/lib/company_fetcher_bot.rb' - -# Cop supports --auto-correct. -# Configuration parameters: EnforcedStyle. -# SupportedStyles: nested, compact -Style/ClassAndModuleChildren: - Exclude: - - 'lib/monkey_patches/mechanize.rb' - -Style/ClassVars: - Exclude: - - 'lib/openc_bot.rb' - -Style/Documentation: - Exclude: - - 'spec/**/*' - - 'test/**/*' - - 'examples/company_fetchers/basic.rb' - - 'lib/monkey_patches/mechanize.rb' - - 'lib/openc_bot.rb' - - 'lib/openc_bot/bot_data_validator.rb' - - 'lib/openc_bot/company_fetcher_bot.rb' - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/dates.rb' - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - - 'lib/openc_bot/helpers/text.rb' - - 'lib/openc_bot/incrementers/base.rb' - - 'lib/openc_bot/incrementers/common.rb' - - 'lib/openc_bot/templates/lib/bot.rb' - - 'lib/openc_bot/templates/lib/company_fetcher_bot.rb' - -# Configuration parameters: EnforcedStyle. -# SupportedStyles: allowed_in_returns, forbidden -Style/DoubleNegation: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Cop supports --auto-correct. -Style/GlobalStdStream: - Exclude: - - 'lib/openc_bot/company_fetcher_bot.rb' - -# Configuration parameters: MinBodyLength. -Style/GuardClause: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Configuration parameters: AllowIfModifier. -Style/IfInsideElse: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Configuration parameters: AllowedMethods. -# AllowedMethods: respond_to_missing? -Style/OptionalBooleanParameter: - Exclude: - - 'lib/openc_bot/helpers/dates.rb' - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - -# Cop supports --auto-correct. -Style/RedundantAssignment: - Exclude: - - 'lib/openc_bot/tasks.rb' - -# Cop supports --auto-correct. -Style/RedundantRegexpCharacterClass: - Exclude: - - 'lib/openc_bot/helpers/dates.rb' - -# Cop supports --auto-correct. -Style/RedundantRegexpEscape: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Cop supports --auto-correct. -# Configuration parameters: EnforcedStyle, AllowInnerSlashes. -# SupportedStyles: slashes, percent_r, mixed -Style/RegexpLiteral: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/dates.rb' - -# Cop supports --auto-correct. -# Configuration parameters: AllowAsExpressionSeparator. -Style/Semicolon: - Exclude: - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - -# Cop supports --auto-correct. -Style/SlicingWithRange: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - -# Cop supports --auto-correct. -Style/StringConcatenation: - Exclude: - - 'Rakefile' - - 'examples/company_fetchers/basic.rb' - - 'lib/openc_bot.rb' - - 'lib/openc_bot/exceptions.rb' - - 'lib/openc_bot/incrementers/base.rb' - - 'lib/openc_bot/tasks.rb' - - 'openc_bot.gemspec' - -# Cop supports --auto-correct. -# Configuration parameters: AutoCorrect, AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns. -# URISchemes: http, https -Layout/LineLength: - Max: 242 From fc7f48d2f4fba4246d73e260689efbe6cb094a2c Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Thu, 8 Apr 2021 13:48:22 +0530 Subject: [PATCH 10/30] change the default acquisition directory (create one if not exist) --- lib/openc_bot/helpers/persistence_handler.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 09a9ff7..43387a6 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -13,7 +13,9 @@ def output_stream end def acquisition_base_directory - ENV["ACQUISITION_BASE_DIRECTORY"] || "data" + dir = ENV.fetch("ACQUISITION_BASE_DIRECTORY", "data/acquisition") + Dir.mkdir(dir) unless Dir.exist?(dir) + dir end def acquisition_id From c747f73792999bb54bdc45924ce01921bf952157 Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Thu, 8 Apr 2021 15:56:25 +0530 Subject: [PATCH 11/30] pluralize folder name --- lib/openc_bot/helpers/persistence_handler.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 43387a6..9c0eb3d 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -13,7 +13,7 @@ def output_stream end def acquisition_base_directory - dir = ENV.fetch("ACQUISITION_BASE_DIRECTORY", "data/acquisition") + dir = ENV.fetch("ACQUISITION_BASE_DIRECTORY", "data/acquisitions") Dir.mkdir(dir) unless Dir.exist?(dir) dir end From 6d442ceb3a1002983b1f5fbecfa10f830b8b194a Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Sat, 10 Apr 2021 19:01:24 +0100 Subject: [PATCH 12/30] Avoid creating an acquisition folder if there is no new data to fetch --- lib/openc_bot/helpers/persistence_handler.rb | 4 ++++ lib/openc_bot/helpers/pseudo_machine_transformer.rb | 4 +++- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 11 ++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 09a9ff7..936d28f 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -86,6 +86,10 @@ def mark_acquisition_directory_as_finished_processing def mark_finished_acquisition_directory_as_processing(processing_directory) File.rename(acquisition_directory_final, processing_directory) end + + def remove_current_processing_acquisition_directory + FileUtils.rm_rf(acquisition_directory_processing) + end end end end diff --git a/lib/openc_bot/helpers/pseudo_machine_transformer.rb b/lib/openc_bot/helpers/pseudo_machine_transformer.rb index ef1839c..bec5041 100644 --- a/lib/openc_bot/helpers/pseudo_machine_transformer.rb +++ b/lib/openc_bot/helpers/pseudo_machine_transformer.rb @@ -35,7 +35,9 @@ def run counter += 1 end end - { transformed: counter, transformer_start: start_time, transformer_end: Time.now.utc } + res = { transformed: counter, transformer_start: start_time, transformer_end: Time.now.utc } + res.merge!({ no_transformed_data: true }) if counter == 0 + res end def schema_name diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index aa2e1d6..f02bfe5 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -59,9 +59,14 @@ def update_data(options = {}) res.merge!(bot_namespace::Transformer.run) processing_states << "transformer" end - res[:data_directory] = acquisition_directory_final - # rename directory so it will be seen by importer - mark_acquisition_directory_as_finished_processing + if res[:no_transformed_data] + # we don't need to keep empty acquisitions + remove_current_processing_acquisition_directory + else + res[:data_directory] = acquisition_directory_final + # rename directory so it will be seen by importer + mark_acquisition_directory_as_finished_processing + end raise "\n#{JSON.pretty_generate(res)}" if res.key?(:fetch_data_error) || res.key?(:update_stale_error) res From 6bf0d4a9c0c6595932ee0a86d5b4b9ef8860981f Mon Sep 17 00:00:00 2001 From: Shyam Peri Date: Thu, 15 Apr 2021 22:05:15 +0530 Subject: [PATCH 13/30] persist only when directory exist --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index a27e3f5..51000be 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -74,7 +74,7 @@ def update_data(options = {}) send_error_report(e, options) raise e ensure - IO.write("#{acquisition_directory}/processing_states.json", processing_states.to_json) + IO.write("#{acquisition_directory}/processing_states.json", processing_states.to_json) if Dir.exist?(acquisition_directory) end end end From 401859c4e07f8b732673c55ce5fd4155d2052b1e Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 09:58:01 +0100 Subject: [PATCH 14/30] Fix stat reporting for pseudo machine bots --- lib/openc_bot/helpers/persistence_handler.rb | 1 + .../pseudo_machine_company_fetcher_bot.rb | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 936d28f..893ad43 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -71,6 +71,7 @@ def persist(res) File.open(output_file_location, "a") do |f| f.puts res.to_json end + track_company_processed end def acquisition_directory diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index f02bfe5..4e5adc2 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -32,6 +32,26 @@ def db_name "#{bot_name.downcase}.db" end + def statsd_namespace + @statsd_namespace ||= begin + bot_env = ENV.fetch("FETCHER_BOT_ENV", "development").to_sym + StatsD.mode = bot_env + StatsD.server = "sys1:8125" + StatsD.logger = Logger.new("/dev/null") if bot_env != :production + + if respond_to?(:output_stream) + if respond_to?(:inferred_jurisdiction_code) && inferred_jurisdiction_code + "pseudo_machine_bot.#{bot_env}.#{inferred_jurisdiction_code}.#{output_stream}" + elsif is_a?(Module) + "pseudo_machine_bot.#{bot_env}.#{name.downcase}.#{output_stream}" + else + "pseudo_machine_bot.#{bot_env}.#{self.class.name.downcase}.#{output_stream}" + end + .sub("companiesfetcher", "") + end + end + end + def processing_states return @processing_states unless @processing_states.nil? From 9c47433f4a04f0c46ad9eb656a44415436181128 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 15:24:02 +0100 Subject: [PATCH 15/30] Include reporting module in Persistence handler --- lib/openc_bot/helpers/persistence_handler.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index a00d9bd..50619d4 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -1,9 +1,13 @@ # frozen_string_literal: true +require "openc_bot/helpers/reporting" + module OpencBot module Helpers # Persistence handler for bot activities module PersistenceHandler + include OpencBot::Helpers::Reporting + def input_stream # override in segment bots end From cd6b8e1fc471b5fbbb1a5905613aa9f6a93b766a Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 16:49:14 +0100 Subject: [PATCH 16/30] Fix parser dependencies --- lib/openc_bot/helpers/persistence_handler.rb | 4 ---- lib/openc_bot/helpers/pseudo_machine_parser.rb | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 50619d4..a00d9bd 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -1,13 +1,9 @@ # frozen_string_literal: true -require "openc_bot/helpers/reporting" - module OpencBot module Helpers # Persistence handler for bot activities module PersistenceHandler - include OpencBot::Helpers::Reporting - def input_stream # override in segment bots end diff --git a/lib/openc_bot/helpers/pseudo_machine_parser.rb b/lib/openc_bot/helpers/pseudo_machine_parser.rb index 65e01bd..bbc5f31 100644 --- a/lib/openc_bot/helpers/pseudo_machine_parser.rb +++ b/lib/openc_bot/helpers/pseudo_machine_parser.rb @@ -1,12 +1,12 @@ # frozen_string_literal: true -require "openc_bot/helpers/persistence_handler" +require "openc_bot/pseudo_machine_company_fetcher_bot" module OpencBot module Helpers # Parsing activities module PseudoMachineParser - include OpencBot::Helpers::PersistenceHandler + include OpencBot::PseudoMachineCompanyFetcherBot def input_stream "fetcher" From 986dcdedb5987f7c52a74f21fb6f1ec32dfa33c0 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 19:42:14 +0100 Subject: [PATCH 17/30] debugging --- lib/openc_bot/helpers/persistence_handler.rb | 1 + lib/openc_bot/helpers/reporting.rb | 1 + lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index a00d9bd..9596c69 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -73,6 +73,7 @@ def persist(res) File.open(output_file_location, "a") do |f| f.puts res.to_json end + puts "persisted" track_company_processed end diff --git a/lib/openc_bot/helpers/reporting.rb b/lib/openc_bot/helpers/reporting.rb index 407487f..fba2342 100644 --- a/lib/openc_bot/helpers/reporting.rb +++ b/lib/openc_bot/helpers/reporting.rb @@ -73,6 +73,7 @@ def report_run_to_analysis_app(params) alias report_run_to_oc report_run_to_analysis_app def track_company_processed + puts "Company about to be reported to statsD" StatsD.increment("#{statsd_namespace}.processed", sample_rate: 1.0) increment_progress_counters(companies_processed_delta: 1) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index 0f75c3b..fd1c2ab 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -33,19 +33,26 @@ def db_name end def statsd_namespace + puts "Getting statsD namespace" @statsd_namespace ||= begin bot_env = ENV.fetch("FETCHER_BOT_ENV", "development").to_sym StatsD.mode = bot_env StatsD.server = "sys1:8125" StatsD.logger = Logger.new("/dev/null") if bot_env != :production + puts "bot_env is #{bot_env}" + if respond_to?(:output_stream) + puts "Responds to output_stream" if respond_to?(:inferred_jurisdiction_code) && inferred_jurisdiction_code "pseudo_machine_bot.#{bot_env}.#{inferred_jurisdiction_code}.#{output_stream}" + puts "Responds to inferred_jur_code" elsif is_a?(Module) "pseudo_machine_bot.#{bot_env}.#{name.downcase}.#{output_stream}" + puts "is a Module" else "pseudo_machine_bot.#{bot_env}.#{self.class.name.downcase}.#{output_stream}" + puts "in the else clause of statsd namespace" end .sub("companiesfetcher", "") end From 45ced318598667445e939e683cc40551db4e051a Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 20:27:29 +0100 Subject: [PATCH 18/30] Fix reporting --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index fd1c2ab..db25e14 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -45,14 +45,14 @@ def statsd_namespace if respond_to?(:output_stream) puts "Responds to output_stream" if respond_to?(:inferred_jurisdiction_code) && inferred_jurisdiction_code - "pseudo_machine_bot.#{bot_env}.#{inferred_jurisdiction_code}.#{output_stream}" puts "Responds to inferred_jur_code" + "pseudo_machine_bot.#{bot_env}.#{inferred_jurisdiction_code}.#{output_stream}" elsif is_a?(Module) - "pseudo_machine_bot.#{bot_env}.#{name.downcase}.#{output_stream}" puts "is a Module" + "pseudo_machine_bot.#{bot_env}.#{name.downcase}.#{output_stream}" else - "pseudo_machine_bot.#{bot_env}.#{self.class.name.downcase}.#{output_stream}" puts "in the else clause of statsd namespace" + "pseudo_machine_bot.#{bot_env}.#{self.class.name.downcase}.#{output_stream}" end .sub("companiesfetcher", "") end From 43638ea5295c86af761ae67cfa185943895fcaf1 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 23:13:27 +0100 Subject: [PATCH 19/30] More StatsD logging --- lib/openc_bot/helpers/reporting.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/openc_bot/helpers/reporting.rb b/lib/openc_bot/helpers/reporting.rb index fba2342..bc3f819 100644 --- a/lib/openc_bot/helpers/reporting.rb +++ b/lib/openc_bot/helpers/reporting.rb @@ -75,6 +75,8 @@ def report_run_to_analysis_app(params) def track_company_processed puts "Company about to be reported to statsD" StatsD.increment("#{statsd_namespace}.processed", sample_rate: 1.0) + puts "StatsD, #{StatsD.enabled}, #{StatsD.host}, #{StatsD.port}" + puts "incremented, #{stastd_namespace}" increment_progress_counters(companies_processed_delta: 1) From 4af1c90e8b88a5e3ac16b7e49f80fbbe8ea72f58 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 20 Apr 2021 23:56:59 +0100 Subject: [PATCH 20/30] Fix logging bug --- lib/openc_bot/helpers/reporting.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openc_bot/helpers/reporting.rb b/lib/openc_bot/helpers/reporting.rb index bc3f819..49c4c62 100644 --- a/lib/openc_bot/helpers/reporting.rb +++ b/lib/openc_bot/helpers/reporting.rb @@ -76,7 +76,7 @@ def track_company_processed puts "Company about to be reported to statsD" StatsD.increment("#{statsd_namespace}.processed", sample_rate: 1.0) puts "StatsD, #{StatsD.enabled}, #{StatsD.host}, #{StatsD.port}" - puts "incremented, #{stastd_namespace}" + puts "incremented, #{statsd_namespace}" increment_progress_counters(companies_processed_delta: 1) From 53f512d39218f2c95c25a6cf2422884786d0f52d Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Wed, 21 Apr 2021 09:46:28 +0100 Subject: [PATCH 21/30] Fix Statsd namespace for new bots --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index db25e14..eadb572 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -46,15 +46,15 @@ def statsd_namespace puts "Responds to output_stream" if respond_to?(:inferred_jurisdiction_code) && inferred_jurisdiction_code puts "Responds to inferred_jur_code" - "pseudo_machine_bot.#{bot_env}.#{inferred_jurisdiction_code}.#{output_stream}" + "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{inferred_jurisdiction_code}" elsif is_a?(Module) puts "is a Module" - "pseudo_machine_bot.#{bot_env}.#{name.downcase}.#{output_stream}" + "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{name.downcase}" else puts "in the else clause of statsd namespace" - "pseudo_machine_bot.#{bot_env}.#{self.class.name.downcase}.#{output_stream}" + "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{self.class.name.downcase}" end - .sub("companiesfetcher", "") + .sub("companiesfetcher", "").sub(/::.*/, "") end end end From ca8b56625330ff4cf7bfbb5e902c50fd360dae53 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Wed, 21 Apr 2021 15:51:10 +0100 Subject: [PATCH 22/30] Implement better jurisdiction formatting for stat collection --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index eadb572..940cf5e 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -49,7 +49,7 @@ def statsd_namespace "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{inferred_jurisdiction_code}" elsif is_a?(Module) puts "is a Module" - "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{name.downcase}" + "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{name.downcase.chars.each_slice(2).map(&:join).join('_')}" else puts "in the else clause of statsd namespace" "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{self.class.name.downcase}" From 3217f223b1cd91e9f7ca190d286f6361b12d9253 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Wed, 21 Apr 2021 16:28:52 +0100 Subject: [PATCH 23/30] Fix jurisdiction formatting for stat collecting --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index 940cf5e..4bfd242 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -49,7 +49,8 @@ def statsd_namespace "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{inferred_jurisdiction_code}" elsif is_a?(Module) puts "is a Module" - "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{name.downcase.chars.each_slice(2).map(&:join).join('_')}" + jur_name = name.downcase.sub("companiesfetcher").sub(/::.*/, "") + "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{jur_name.chars.each_slice(2).map(&:join).join('_')}" else puts "in the else clause of statsd namespace" "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{self.class.name.downcase}" From 626f79fc618f7fa28b650b48f8b98e9f8ac499e1 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Wed, 21 Apr 2021 17:01:53 +0100 Subject: [PATCH 24/30] Bug fix --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index 4bfd242..5b21bc0 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -49,7 +49,7 @@ def statsd_namespace "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{inferred_jurisdiction_code}" elsif is_a?(Module) puts "is a Module" - jur_name = name.downcase.sub("companiesfetcher").sub(/::.*/, "") + jur_name = name.downcase.sub("companiesfetcher", "").sub(/::.*/, "") "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{jur_name.chars.each_slice(2).map(&:join).join('_')}" else puts "in the else clause of statsd namespace" From dae48c144f53c900783425080110f5626b114bfb Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Fri, 23 Apr 2021 12:48:52 +0100 Subject: [PATCH 25/30] Replace hardcoded path with data_dir definition --- lib/openc_bot/helpers/persistence_handler.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 9596c69..e468345 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -13,7 +13,7 @@ def output_stream end def acquisition_base_directory - dir = ENV.fetch("ACQUISITION_BASE_DIRECTORY", "data/acquisitions") + dir = ENV.fetch("ACQUISITION_BASE_DIRECTORY", File.join(data_dir, "acquisitions")) Dir.mkdir(dir) unless Dir.exist?(dir) dir end From 32d797f6da28e6a526a4e19f6654cef815e9766d Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Fri, 23 Apr 2021 12:40:50 +0100 Subject: [PATCH 26/30] Define separate weekend hours --- lib/openc_bot/helpers/register_methods.rb | 19 ++++++++++++++++--- .../helpers/register_methods_spec.rb | 8 +++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/openc_bot/helpers/register_methods.rb b/lib/openc_bot/helpers/register_methods.rb index 56daab7..e10e70f 100644 --- a/lib/openc_bot/helpers/register_methods.rb +++ b/lib/openc_bot/helpers/register_methods.rb @@ -30,6 +30,16 @@ def allowed_hours end end + def allowed_weekend_hours + if const_defined?("ALLOWED_WEEKEND_HOURS") + const_get("ALLOWED_WEEKEND_HOURS").to_a + elsif const_defined?("TIMEZONE") + # See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones for definitions/examples + # eg TIMEZONE = "America/Panama" + (0..10).to_a + (12..22).to_a + end + end + def use_alpha_search const_defined?("USE_ALPHA_SEARCH") && const_get("USE_ALPHA_SEARCH") end @@ -101,8 +111,11 @@ def fetch_registry_page(company_number, options = {}) def in_prohibited_time? current_time = current_time_in_zone - - allowed_hours && !allowed_hours.include?(current_time.hour) && !current_time.saturday? && !current_time.sunday? + if current_time.saturday? || current_time.sunday? + allowed_weekend_hours && !allowed_weekend_hours.include?(current_time.hour) + else + allowed_hours && !allowed_hours.include?(current_time.hour) + end end def prepare_and_save_data(all_data, _options = {}) @@ -382,7 +395,7 @@ def _client(options = {}) end def _http_get(url, options = {}) - raise OutOfPermittedHours, "Request at #{Time.now} is not out business hours (#{allowed_hours})" if options[:restrict_to_out_of_hours] && in_prohibited_time? + raise OutOfPermittedHours, "Request at #{Time.now} is not out business hours (Weekdays: #{allowed_hours}, Weekends: #{allowed_weekend_hours})" if options[:restrict_to_out_of_hours] && in_prohibited_time? _client(options).get_content(url) end diff --git a/spec/lib/openc_bot/helpers/register_methods_spec.rb b/spec/lib/openc_bot/helpers/register_methods_spec.rb index e410078..3687c07 100644 --- a/spec/lib/openc_bot/helpers/register_methods_spec.rb +++ b/spec/lib/openc_bot/helpers/register_methods_spec.rb @@ -1030,13 +1030,15 @@ module ModuleWithNoCustomPrimaryKey describe "in_prohibited_time?" do before do - allow(ModuleThatIncludesRegisterMethods).to receive(:allowed_hours).and_return((0..12)) + allow(ModuleThatIncludesRegisterMethods).to receive(:allowed_hours).and_return((0..12).to_a) + allow(ModuleThatIncludesRegisterMethods).to receive(:allowed_weekend_hours).and_return((0..10).to_a + (12..22).to_a) end it "returns true only if current_time_in_zone out of office hours" do times_and_truthiness = { "2014-10-09 04:14:25 +0100" => false, # weekday out of hours - "2014-10-11 15:14:25 +0100" => false, # in weekend + "2014-10-11 11:14:25 +0100" => true, # in weekend, out of ALLOWED weekend hours + "2014-10-11 15:14:25 +0100" => false, # in weekend, within ALLOWED weekend hours "2014-10-10 15:14:25 +0100" => true, # weekday in business hours } times_and_truthiness.each do |datetime, truthiness| @@ -1045,7 +1047,7 @@ module ModuleWithNoCustomPrimaryKey end end - it "returns false if allowed_hours not defined" do + it "returns nil if allowed_hours not defined" do expect(ModuleWithNoCustomPrimaryKey.in_prohibited_time?).to be_nil end end From a22e0b8d6d80c23a1b6e7fb0d46db4087916cec2 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Fri, 23 Apr 2021 16:46:11 +0100 Subject: [PATCH 27/30] Remove stat debugging messages --- lib/openc_bot/helpers/persistence_handler.rb | 1 - lib/openc_bot/helpers/reporting.rb | 3 --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 6 ------ 3 files changed, 10 deletions(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index e468345..178f866 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -73,7 +73,6 @@ def persist(res) File.open(output_file_location, "a") do |f| f.puts res.to_json end - puts "persisted" track_company_processed end diff --git a/lib/openc_bot/helpers/reporting.rb b/lib/openc_bot/helpers/reporting.rb index 49c4c62..407487f 100644 --- a/lib/openc_bot/helpers/reporting.rb +++ b/lib/openc_bot/helpers/reporting.rb @@ -73,10 +73,7 @@ def report_run_to_analysis_app(params) alias report_run_to_oc report_run_to_analysis_app def track_company_processed - puts "Company about to be reported to statsD" StatsD.increment("#{statsd_namespace}.processed", sample_rate: 1.0) - puts "StatsD, #{StatsD.enabled}, #{StatsD.host}, #{StatsD.port}" - puts "incremented, #{statsd_namespace}" increment_progress_counters(companies_processed_delta: 1) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index 5b21bc0..f2cc614 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -40,19 +40,13 @@ def statsd_namespace StatsD.server = "sys1:8125" StatsD.logger = Logger.new("/dev/null") if bot_env != :production - puts "bot_env is #{bot_env}" - if respond_to?(:output_stream) - puts "Responds to output_stream" if respond_to?(:inferred_jurisdiction_code) && inferred_jurisdiction_code - puts "Responds to inferred_jur_code" "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{inferred_jurisdiction_code}" elsif is_a?(Module) - puts "is a Module" jur_name = name.downcase.sub("companiesfetcher", "").sub(/::.*/, "") "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{jur_name.chars.each_slice(2).map(&:join).join('_')}" else - puts "in the else clause of statsd namespace" "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{self.class.name.downcase}" end .sub("companiesfetcher", "").sub(/::.*/, "") From d51358709e74dbe7380e118c6e6c64364d76ae12 Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Tue, 4 May 2021 13:36:45 +0100 Subject: [PATCH 28/30] Remove redundant StatsD logging --- lib/openc_bot/pseudo_machine_company_fetcher_bot.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb index f2cc614..0fb138c 100644 --- a/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb +++ b/lib/openc_bot/pseudo_machine_company_fetcher_bot.rb @@ -33,7 +33,6 @@ def db_name end def statsd_namespace - puts "Getting statsD namespace" @statsd_namespace ||= begin bot_env = ENV.fetch("FETCHER_BOT_ENV", "development").to_sym StatsD.mode = bot_env From bc6d4645bdd152c72c7c58318759cca7c33988bb Mon Sep 17 00:00:00 2001 From: Ivan Bashkirov Date: Fri, 4 Jun 2021 11:37:20 +0100 Subject: [PATCH 29/30] Allow external bots persisting parsed data record-by-record by yielding a block --- .../helpers/pseudo_machine_parser.rb | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/lib/openc_bot/helpers/pseudo_machine_parser.rb b/lib/openc_bot/helpers/pseudo_machine_parser.rb index bbc5f31..b7af55d 100644 --- a/lib/openc_bot/helpers/pseudo_machine_parser.rb +++ b/lib/openc_bot/helpers/pseudo_machine_parser.rb @@ -14,22 +14,35 @@ def input_stream def parse(fetched_datum) # implement in bot - # may return either a single parsed datum or an array of data from - # single fetched_datum, e.g. if fetched_datum is a CSV file + # may return either: + # a single parsed datum + # an array of data from single fetched_datum, e.g. if fetched_datum is a CSV file + # call "yield(parsed_datum)" inside the parse method passing each parsed record, to persist one-by-one end def run start_time = Time.now.utc counter = 0 input_data do |fetched_datum| - parsed_data = parse(fetched_datum) - parsed_data = [parsed_data] unless parsed_data.is_a?(Array) - parsed_data.each do |parsed_datum| + yielded = false + # the parse method can use yield + parsed_data = parse(fetched_datum) do |parsed_datum| + yielded = true next if parsed_datum.blank? persist(parsed_datum) counter += 1 end + + unless yielded + parsed_data = [parsed_data] unless parsed_data.is_a?(Array) + parsed_data.each do |parsed_datum| + next if parsed_datum.blank? + + persist(parsed_datum) + counter += 1 + end + end end { parsed: counter, parser_start: start_time, parser_end: Time.now.utc } end From 9a5c2ad522e6a92699a64fbe23d59b4376c44bac Mon Sep 17 00:00:00 2001 From: mpbialoszewski Date: Tue, 8 Jun 2021 16:20:45 +0100 Subject: [PATCH 30/30] Change error login for input_data method --- lib/openc_bot/helpers/persistence_handler.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/openc_bot/helpers/persistence_handler.rb b/lib/openc_bot/helpers/persistence_handler.rb index 178f866..0a69ba6 100644 --- a/lib/openc_bot/helpers/persistence_handler.rb +++ b/lib/openc_bot/helpers/persistence_handler.rb @@ -64,8 +64,9 @@ def input_data File.foreach(input_file_location) do |line| yield JSON.parse(line) end - rescue Errno::ENOENT - warn "No such file: #{input_file_location} present" + rescue Errno::ENOENT => e + warn "Error raised while processing the file: #{input_file_location}" + warn "Requested file not found: #{e.message}" [] end