diff --git a/.gitmodules b/.gitmodules index fab109b..9f5657f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "schemas"] path = schemas url = git://github.com/openc/openc-schema.git + branch = tighten-additional-properties diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml deleted file mode 100644 index 5857242..0000000 --- a/.rubocop_todo.yml +++ /dev/null @@ -1,302 +0,0 @@ -# This configuration was generated by -# `rubocop --auto-gen-config --exclude-limit 999 --no-offense-counts --no-auto-gen-timestamp` -# using RuboCop version 0.93.1. -# The point is for the user to remove these configuration records -# one by one as the offenses are removed from the code base. -# Note that changes in the inspected code, or installation of new -# versions of RuboCop, may require this file to be generated again. - -# Configuration parameters: Include. -# Include: **/*.gemspec -Gemspec/RequiredRubyVersion: - Exclude: - - 'openc_bot.gemspec' - -# Cop supports --auto-correct. -# Configuration parameters: EnforcedStyleAlignWith, Severity. -# SupportedStylesAlignWith: start_of_line, begin -Layout/BeginEndAlignment: - Exclude: - - 'examples/company_fetchers/basic.rb' - -# Cop supports --auto-correct. -Layout/RescueEnsureAlignment: - Exclude: - - 'examples/company_fetchers/basic.rb' - -# Configuration parameters: AllowSafeAssignment. -Lint/AssignmentInCondition: - Exclude: - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - -Lint/RescueException: - Exclude: - - 'lib/openc_bot/bot_data_validator.rb' - - 'lib/openc_bot/company_fetcher_bot.rb' - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - - 'lib/openc_bot/jobs/sru_request_job.rb' - - 'lib/openc_bot/tasks.rb' - -Lint/ShadowedException: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - -# Configuration parameters: AllowComments. 
-Lint/SuppressedException: - Exclude: - - 'lib/openc_bot/incrementers/base.rb' - -Lint/UselessAssignment: - Exclude: - - 'lib/openc_bot/bot_data_validator.rb' - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/tasks.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: IgnoredMethods. -Metrics/AbcSize: - Max: 33 - -# Configuration parameters: IgnoredMethods. -Metrics/CyclomaticComplexity: - Max: 13 - -# Configuration parameters: CountComments, CountAsOne, ExcludedMethods. -Metrics/MethodLength: - Max: 27 - -# Configuration parameters: CountKeywordArgs. -Metrics/ParameterLists: - Max: 6 - -# Configuration parameters: IgnoredMethods. -Metrics/PerceivedComplexity: - Max: 14 - -Naming/AccessorMethodName: - Exclude: - - 'lib/openc_bot/tasks.rb' - -# Configuration parameters: ExpectMatchingDefinition, CheckDefinitionPathHierarchy, Regex, IgnoreExecutableScripts, AllowedAcronyms. -# AllowedAcronyms: CLI, DSL, ACL, API, ASCII, CPU, CSS, DNS, EOF, GUID, HTML, HTTP, HTTPS, ID, IP, JSON, LHS, QPS, RAM, RHS, RPC, SLA, SMTP, SQL, SSH, TCP, TLS, TTL, UDP, UI, UID, UUID, URI, URL, UTF8, VM, XML, XMPP, XSRF, XSS -Naming/FileName: - Exclude: - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: EnforcedStyle. -# SupportedStyles: snake_case, normalcase, non_integer -Naming/VariableNumber: - Exclude: - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: Prefixes. -# Prefixes: when, with, without -RSpec/ContextWording: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: IgnoredMetadata. 
-RSpec/DescribeClass: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/exceptions_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: Max. -RSpec/ExampleLength: - Exclude: - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/dates_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot/helpers/reporting_spec.rb' - - 'spec/lib/openc_bot/helpers/text_spec.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -RSpec/ExpectInHook: - Exclude: - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - -# Configuration parameters: CustomTransform, IgnoreMethods, SpecSuffixOnly. -RSpec/FilePath: - Exclude: - - 'lib/openc_bot/templates/spec/bot_spec.rb' - - 'spec/lib/bot_data_validator_spec.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - -# Configuration parameters: AssignmentOnly. -RSpec/InstanceVariable: - Exclude: - - 'spec/lib/bot_data_validator_spec.rb' - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/exceptions_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot/incrementers/common_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - - 'spec/schemas/company-schema_spec.rb' - -# Configuration parameters: EnforcedStyle. 
-# SupportedStyles: have_received, receive -RSpec/MessageSpies: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot/helpers/reporting_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - -RSpec/MultipleExpectations: - Max: 48 - -RSpec/NestedGroups: - Max: 5 - -RSpec/StubbedMock: - Exclude: - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - - 'spec/lib/openc_bot/helpers/incremental_search_spec.rb' - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - -# Configuration parameters: IgnoreNameless, IgnoreSymbolicNames. -RSpec/VerifiedDoubles: - Exclude: - - 'spec/lib/company_fetcher_bot_spec.rb' - - 'spec/lib/openc_bot_spec.rb' - -Security/Open: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - -# Configuration parameters: AllowedChars. -Style/AsciiComments: - Exclude: - - 'lib/openc_bot/templates/lib/company_fetcher_bot.rb' - -# Cop supports --auto-correct. -# Configuration parameters: EnforcedStyle. 
-# SupportedStyles: nested, compact -Style/ClassAndModuleChildren: - Exclude: - - 'lib/monkey_patches/mechanize.rb' - -Style/ClassVars: - Exclude: - - 'lib/openc_bot.rb' - -Style/Documentation: - Exclude: - - 'spec/**/*' - - 'test/**/*' - - 'examples/company_fetchers/basic.rb' - - 'lib/monkey_patches/mechanize.rb' - - 'lib/openc_bot.rb' - - 'lib/openc_bot/bot_data_validator.rb' - - 'lib/openc_bot/company_fetcher_bot.rb' - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/dates.rb' - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - - 'lib/openc_bot/helpers/text.rb' - - 'lib/openc_bot/incrementers/base.rb' - - 'lib/openc_bot/incrementers/common.rb' - - 'lib/openc_bot/templates/lib/bot.rb' - - 'lib/openc_bot/templates/lib/company_fetcher_bot.rb' - -# Configuration parameters: EnforcedStyle. -# SupportedStyles: allowed_in_returns, forbidden -Style/DoubleNegation: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Cop supports --auto-correct. -Style/GlobalStdStream: - Exclude: - - 'lib/openc_bot/company_fetcher_bot.rb' - -# Configuration parameters: MinBodyLength. -Style/GuardClause: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Configuration parameters: AllowIfModifier. -Style/IfInsideElse: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Configuration parameters: AllowedMethods. -# AllowedMethods: respond_to_missing? -Style/OptionalBooleanParameter: - Exclude: - - 'lib/openc_bot/helpers/dates.rb' - - 'lib/openc_bot/helpers/incremental_search.rb' - - 'lib/openc_bot/helpers/register_methods.rb' - -# Cop supports --auto-correct. -Style/RedundantAssignment: - Exclude: - - 'lib/openc_bot/tasks.rb' - -# Cop supports --auto-correct. -Style/RedundantRegexpCharacterClass: - Exclude: - - 'lib/openc_bot/helpers/dates.rb' - -# Cop supports --auto-correct. 
-Style/RedundantRegexpEscape: - Exclude: - - 'lib/openc_bot/helpers/register_methods.rb' - -# Cop supports --auto-correct. -# Configuration parameters: EnforcedStyle, AllowInnerSlashes. -# SupportedStyles: slashes, percent_r, mixed -Style/RegexpLiteral: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'lib/openc_bot/helpers/dates.rb' - -# Cop supports --auto-correct. -# Configuration parameters: AllowAsExpressionSeparator. -Style/Semicolon: - Exclude: - - 'spec/lib/openc_bot/helpers/register_methods_spec.rb' - -# Cop supports --auto-correct. -Style/SlicingWithRange: - Exclude: - - 'lib/openc_bot/helpers/alpha_search.rb' - - 'spec/lib/openc_bot/helpers/alpha_search_spec.rb' - -# Cop supports --auto-correct. -Style/StringConcatenation: - Exclude: - - 'Rakefile' - - 'examples/company_fetchers/basic.rb' - - 'lib/openc_bot.rb' - - 'lib/openc_bot/exceptions.rb' - - 'lib/openc_bot/incrementers/base.rb' - - 'lib/openc_bot/tasks.rb' - - 'openc_bot.gemspec' - -# Cop supports --auto-correct. -# Configuration parameters: AutoCorrect, AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns. 
-# URISchemes: http, https -Layout/LineLength: - Max: 242 diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..ec1cf33 --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +2.6.3 diff --git a/Gemfile.lock b/Gemfile.lock index e9c0866..d20ad01 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -34,48 +34,51 @@ GEM tzinfo (~> 1.1) addressable (2.7.0) public_suffix (>= 2.0.2, < 5.0) - ast (2.4.1) - backports (3.18.2) + ast (2.4.2) + backports (3.20.2) byebug (10.0.2) coderay (1.1.3) - concurrent-ruby (1.1.7) - crack (0.4.4) + concurrent-ruby (1.1.8) + crack (0.4.5) + rexml diff-lcs (1.4.4) hashdiff (1.0.1) httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.3.1) + json (2.5.1) json-schema (2.8.1) addressable (>= 2.4) mail (2.7.1) mini_mime (>= 0.1.1) method_source (1.0.0) mini_mime (1.0.2) - mini_portile2 (2.4.0) - minitest (5.14.2) + mini_portile2 (2.5.0) + minitest (5.14.4) mono_logger (1.1.0) multi_json (1.15.0) mustermann (1.1.1) ruby2_keywords (~> 0.0.1) - nokogiri (1.10.10) - mini_portile2 (~> 2.4.0) + nokogiri (1.11.2) + mini_portile2 (~> 2.5.0) + racc (~> 1.4) parallel (1.20.1) - parser (2.7.2.0) + parser (3.0.0.0) ast (~> 2.4.1) - pry (0.13.1) + pry (0.14.0) coderay (~> 1.1) method_source (~> 1.0) public_suffix (4.0.6) + racc (1.5.2) rack (2.2.3) rack-protection (2.1.0) rack rainbow (3.0.0) rake (12.3.3) redis (4.2.5) - redis-namespace (1.8.0) + redis-namespace (1.8.1) redis (>= 3.0.4) - regexp_parser (2.0.0) + regexp_parser (2.1.1) resque (2.0.0) mono_logger (~> 1.0) multi_json (~> 1.0) @@ -88,15 +91,15 @@ GEM rspec-core (~> 3.10.0) rspec-expectations (~> 3.10.0) rspec-mocks (~> 3.10.0) - rspec-core (3.10.0) + rspec-core (3.10.1) rspec-support (~> 3.10.0) - rspec-expectations (3.10.0) + rspec-expectations (3.10.1) diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.10.0) - rspec-mocks (3.10.0) + rspec-mocks (3.10.2) diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.10.0) - rspec-support (3.10.0) + rspec-support (3.10.2) rubocop 
# frozen_string_literal: true

require "fileutils"
require "json"

module OpencBot
  module Helpers
    # File-based persistence for the fetcher/parser/transformer stages of a bot.
    #
    # Each stage appends JSON lines to "<output_stream>.jsonl" inside a
    # per-acquisition "<acquisition_id>_processing" directory and reads the
    # previous stage's output from "<input_stream>.jsonl" in the same directory.
    #
    # The including/extending object must provide `name`, `data_dir` and
    # `track_company_processed`; `blank?` must be available on core objects
    # (NOTE(review): presumably supplied by ActiveSupport via openc_bot - confirm).
    module PersistenceHandler
      # Name of the stream this stage reads from. Returns nil here; stages that
      # consume input override it.
      def input_stream
        # override in segment bots
      end

      # Name of the stream this stage writes to: the trailing capitalised word
      # of `name`, downcased (e.g. "MyBot::Fetcher" -> "fetcher").
      # Raises NoMethodError when `name` has no such trailing word.
      def output_stream
        name.to_s[/[A-Z][a-z]+$/].downcase
      end

      # Directory holding all acquisitions, created on demand.
      # Taken from ENV["ACQUISITION_BASE_DIRECTORY"], defaulting to
      # "<data_dir>/acquisitions".
      def acquisition_base_directory
        dir = ENV.fetch("ACQUISITION_BASE_DIRECTORY", File.join(data_dir, "acquisitions"))
        # mkdir_p also creates missing parents; Dir.mkdir raised in that case
        FileUtils.mkdir_p(dir)
        dir
      end

      # Identifier of the current acquisition (memoised).
      # Unless ENV["FORCE_NEW_ACQUISITION"] is set, prefers ENV["ACQUISITION_ID"],
      # then the most recent in-progress acquisition, then the current epoch
      # seconds; with FORCE_NEW_ACQUISITION set it is always the epoch seconds.
      def acquisition_id
        @acquisition_id ||=
          if ENV["FORCE_NEW_ACQUISITION"].blank?
            ENV["ACQUISITION_ID"] || in_progress_acquisition_id || Time.now.to_i.to_s
          else
            Time.now.to_i.to_s
          end
      end

      # gets the most recent in progress acquisition id, based on in processing
      # directories. Returns the memoised id when one is already set, and nil
      # when no "*_processing" directory exists.
      def in_progress_acquisition_id
        return @acquisition_id unless @acquisition_id.blank?

        latest = Dir.glob("#{acquisition_base_directory}/*_processing").max
        latest && File.basename(latest, "_processing")
      end

      # Path of the JSON-lines file this stage reads.
      def input_file_location
        File.join(acquisition_directory_processing, "#{input_stream}.jsonl")
      end

      # Path of the JSON-lines file this stage appends to.
      def output_file_location
        File.join(acquisition_directory_processing, "#{output_stream}.jsonl")
      end

      # Path of the in-progress directory for the current acquisition
      # (ENV["ACQUISITION_DIRECTORY"] when set). When the directory is missing
      # it is created, or - if ENV["ACQUISITION_ID"] is set - the finished
      # directory for that id is renamed back to its processing name.
      def acquisition_directory_processing
        processing_directory = ENV["ACQUISITION_DIRECTORY"] || File.join(acquisition_base_directory, "#{acquisition_id}_processing")
        unless Dir.exist?(processing_directory)
          if ENV["ACQUISITION_ID"]
            mark_finished_acquisition_directory_as_processing(processing_directory)
          else
            FileUtils.mkdir_p(processing_directory)
          end
        end
        @acquisition_directory ||= processing_directory
        processing_directory
      end

      # Path the acquisition directory has once processing is finished.
      # Side effect: records that path as the current acquisition directory.
      def acquisition_directory_final
        @acquisition_directory = File.join(acquisition_base_directory, in_progress_acquisition_id)
      end

      # Number of lines written so far to the output file; 0 when the file does
      # not exist. Counted in-process instead of interpolating the path into a
      # `wc -l` shell command.
      def records_processed
        path = output_file_location
        File.file?(path) ? File.foreach(path).count : 0
      end

      # Yields each line of the input file parsed as JSON.
      # When the input file is missing, warns and returns [].
      def input_data
        File.foreach(input_file_location) do |line|
          yield JSON.parse(line)
        end
      rescue Errno::ENOENT => e
        warn "Error raised while processing the file: #{input_file_location}"
        warn "Requested file not found: #{e.message}"
        []
      end

      # Appends `res` as one JSON line to the output file, then calls
      # `track_company_processed` and returns its result.
      def persist(res)
        File.open(output_file_location, "a") do |f|
          f.puts res.to_json
        end
        track_company_processed
      end

      # The recorded acquisition directory, or the processing directory when
      # none has been recorded yet.
      def acquisition_directory
        @acquisition_directory || acquisition_directory_processing
      end

      private

      # Renames "<id>_processing" to "<id>".
      def mark_acquisition_directory_as_finished_processing
        File.rename(acquisition_directory_processing, acquisition_directory_final)
      end

      # Renames the finished "<id>" directory back to `processing_directory`.
      def mark_finished_acquisition_directory_as_processing(processing_directory)
        File.rename(acquisition_directory_final, processing_directory)
      end

      # Deletes the current processing directory and everything in it.
      def remove_current_processing_acquisition_directory
        FileUtils.rm_rf(acquisition_directory_processing)
      end
    end
  end
end
# frozen_string_literal: true

require "openc_bot/pseudo_machine_company_fetcher_bot"

module OpencBot
  module Helpers
    # Parser stage: reads the fetcher's output, parses each datum and persists
    # every non-blank result.
    module PseudoMachineParser
      include OpencBot::PseudoMachineCompanyFetcherBot

      # The parser consumes what the fetcher stage wrote.
      def input_stream
        "fetcher"
      end

      def parse(fetched_datum)
        # implement in bot
        # may return either:
        # a single parsed datum
        # an array of data from single fetched_datum, e.g. if fetched_datum is a CSV file
        # call "yield(parsed_datum)" inside the parse method passing each parsed record, to persist one-by-one
      end

      # Runs `parse` over every input datum and persists the non-blank results.
      # Results are taken from the block when `parse` yields, otherwise from
      # its return value (a single datum or an Array of data).
      # Returns { parsed:, parser_start:, parser_end: }.
      def run
        started_at = Time.now.utc
        stored = 0

        # persists one parsed datum and counts it; blank data is ignored
        keep = lambda do |datum|
          next if datum.blank?

          persist(datum)
          stored += 1
        end

        input_data do |fetched_datum|
          used_block = false
          # the parse method can use yield
          returned = parse(fetched_datum) do |datum|
            used_block = true
            keep.call(datum)
          end
          next if used_block

          batch = returned.is_a?(Array) ? returned : [returned]
          batch.each { |datum| keep.call(datum) }
        end

        { parsed: stored, parser_start: started_at, parser_end: Time.now.utc }
      end
    end
  end
end
# Converts an exception into a plain Hash with its class name, message and
# backtrace.
def exception_to_object(exp)
  {
    klass: exp.class.to_s,
    message: exp.message,
    backtrace: exp.backtrace
  }
end

# Hours of the day in which requests are permitted at weekends.
# Uses the ALLOWED_WEEKEND_HOURS constant when defined; otherwise, when a
# TIMEZONE constant is defined, hours 0-10 and 12-22; otherwise nil.
def allowed_weekend_hours
  return const_get("ALLOWED_WEEKEND_HOURS").to_a if const_defined?("ALLOWED_WEEKEND_HOURS")
  return unless const_defined?("TIMEZONE")

  # See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones for definitions/examples
  # eg TIMEZONE = "America/Panama"
  (0..10).to_a + (12..22).to_a
end

# Truthy when the current hour (in the bot's zone) is outside the permitted
# hours for the day type: weekend hours on Saturday/Sunday, weekday hours
# otherwise. Returns nil when no permitted hours are configured.
def in_prohibited_time?
  now = current_time_in_zone
  weekend = now.saturday? || now.sunday?
  permitted = weekend ? allowed_weekend_hours : allowed_hours
  permitted && !permitted.include?(now.hour)
end

# Fetches `url` with the configured client. With
# options[:restrict_to_out_of_hours] set, raises OutOfPermittedHours when the
# current time is prohibited.
def _http_get(url, options = {})
  if options[:restrict_to_out_of_hours] && in_prohibited_time?
    raise OutOfPermittedHours, "Request at #{Time.now} is not out business hours (Weekdays: #{allowed_hours}, Weekends: #{allowed_weekend_hours})"
  end

  _client(options).get_content(url)
end
# frozen_string_literal: true

require "openc_bot"
require "openc_bot/company_fetcher_bot"
require "openc_bot/helpers/persistence_handler"

module OpencBot
  # Pseudo-machine fetcher bot: top-level module that orchestrates the
  # Fetcher, Parser and Transformer stages of a bot.
  module PseudoMachineCompanyFetcherBot
    include OpencBot::CompanyFetcherBot
    include OpencBot::Helpers::PersistenceHandler

    # Name of the bot, taken from the last segment of the working directory.
    def bot_name
      @bot_name ||= Dir.pwd.split("/").last
    end

    # Resolves the constant for `underscore_file_name` and returns a new
    # instance of it when it responds to `new`, otherwise the constant itself.
    def callable_from_file_name(underscore_file_name)
      bot_klass = klass_from_file_name(underscore_file_name)
      if bot_klass.respond_to?(:new)
        bot_klass.new
      else
        bot_klass
      end
    end

    # Looks up the top-level constant named by the CamelCased form of
    # `underscore_file_name` (e.g. "my_bot" -> MyBot).
    def klass_from_file_name(underscore_file_name)
      camelcase_version = underscore_file_name.split("_").map(&:capitalize).join
      Object.const_get(camelcase_version)
    end

    # Database file name: the bot name without underscores, downcased, ".db".
    def db_name
      "#{bot_name.gsub('_', '').downcase}.db"
    end

    # Configures StatsD from ENV["FETCHER_BOT_ENV"] (default "development") and
    # returns the memoised metrics namespace
    # "pseudo_machine_bot.<env>.<output_stream>.<suffix>".
    # The suffix is the inferred jurisdiction code when available, otherwise a
    # value derived from the module or class name.
    def statsd_namespace
      @statsd_namespace ||= begin
        bot_env = ENV.fetch("FETCHER_BOT_ENV", "development").to_sym
        StatsD.mode = bot_env
        StatsD.server = "sys1:8125"
        # outside production, StatsD logging is discarded
        StatsD.logger = Logger.new("/dev/null") if bot_env != :production

        if respond_to?(:output_stream)
          if respond_to?(:inferred_jurisdiction_code) && inferred_jurisdiction_code
            "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{inferred_jurisdiction_code}"
          elsif is_a?(Module)
            jur_name = name.downcase.sub("companiesfetcher", "").sub(/::.*/, "")
            # the remaining name is split into two-character groups joined by "_"
            "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{jur_name.chars.each_slice(2).map(&:join).join('_')}"
          else
            "pseudo_machine_bot.#{bot_env}.#{output_stream}.#{self.class.name.downcase}"
          end
          .sub("companiesfetcher", "").sub(/::.*/, "")
        end
      end
    end

    # Names of the stages already completed for the current acquisition
    # (memoised). Loaded from "processing_states.json" in the acquisition
    # directory when that file exists, otherwise an empty Array.
    def processing_states
      return @processing_states unless @processing_states.nil?

      state_file = "#{acquisition_directory}/processing_states.json"
      @processing_states = if File.exist?(state_file)
                             # File.read, not IO.read: IO.read treats a leading "|" as a command
                             JSON.parse(File.read(state_file))
                           else
                             []
                           end
    end

    # Outline bot run logic.
    #
    # Runs each stage not yet recorded in `processing_states`, merging each
    # stage's result Hash into the returned Hash. When the transformer reports
    # :no_transformed_data the processing directory is removed; otherwise it is
    # renamed to its final name and reported under :data_directory.
    #
    # Raises RuntimeError (with the result as pretty JSON) when the result
    # contains :fetch_data_error or :update_stale_error. Any StandardError is
    # passed to `send_error_report` and re-raised. The completed stages are
    # always written back to "processing_states.json" if the acquisition
    # directory still exists.
    def update_data(options = {})
      res = {}
      bot_namespace = callable_from_file_name(bot_name)
      unless processing_states.include?("fetcher")
        res.merge!(bot_namespace::Fetcher.run)
        processing_states << "fetcher"
      end
      unless processing_states.include?("parser")
        res.merge!(bot_namespace::Parser.run)
        processing_states << "parser"
      end
      unless processing_states.include?("transformer")
        res.merge!(bot_namespace::Transformer.run)
        processing_states << "transformer"
      end
      if res[:no_transformed_data]
        # we don't need to keep empty acquisitions
        remove_current_processing_acquisition_directory
      else
        res[:data_directory] = acquisition_directory_final
        # rename directory so it will be seen by importer
        mark_acquisition_directory_as_finished_processing
      end
      raise "\n#{JSON.pretty_generate(res)}" if res.key?(:fetch_data_error) || res.key?(:update_stale_error)

      res
    rescue StandardError => e
      send_error_report(e, options)
      raise e
    ensure
      File.write("#{acquisition_directory}/processing_states.json", processing_states.to_json) if Dir.exist?(acquisition_directory)
    end
  end
end
only_process_running("#{bot_name}-#{t.name}-#{args[:acquisition_id]}") do + require_relative File.join(Dir.pwd, "lib", bot_name) + callable_from_file_name(bot_name)::Parser.run + end + end + + desc "Transforms the entries from parse output of specific acqusition" + task :transform do |t, args| + bot_name = get_bot_name + only_process_running("#{bot_name}-#{t.name}-#{args[:acquisition_id]}") do + require_relative File.join(Dir.pwd, "lib", bot_name) + callable_from_file_name(bot_name)::Transformer.run + end + end + desc "Perform a fetcher bot update_data run without reporting and with dev/debug options" task :run do |t, args| bot_name = get_bot_name @@ -80,9 +107,9 @@ def pid_dir require_relative File.join(Dir.pwd, "lib", bot_name) runner = callable_from_file_name(bot_name) - count = runner.run(options) + res = runner.run(options) - puts "Got #{count} records" + puts res.to_json end rescue Exception => e raise e unless e.message[/already running/i] diff --git a/schemas b/schemas index 051a03a..0a2f8ac 160000 --- a/schemas +++ b/schemas @@ -1 +1 @@ -Subproject commit 051a03a84b0a05d175e11737dc2a9ded0d9db647 +Subproject commit 0a2f8ac89be64caeb46c008aedf75f3e65015d84 diff --git a/spec/lib/openc_bot/helpers/persistence_handler_spec.rb b/spec/lib/openc_bot/helpers/persistence_handler_spec.rb new file mode 100644 index 0000000..c59f165 --- /dev/null +++ b/spec/lib/openc_bot/helpers/persistence_handler_spec.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/persistence_handler" + +module ModuleThatIncludesPersistenceHandlerFoo + extend OpencBot + extend OpencBot::Helpers::PersistenceHandler +end + +describe OpencBot::Helpers::PersistenceHandler do + context "when a module that includes PersistenceHandler" do + it "return's last word of module name" do + expect(ModuleThatIncludesPersistenceHandlerFoo.output_stream).to eq("foo") + end + end +end diff --git 
a/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb new file mode 100644 index 0000000..bfb5c4e --- /dev/null +++ b/spec/lib/openc_bot/helpers/pseudo_machine_fetcher_spec.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/pseudo_machine_fetcher" +require "openc_bot/helpers/persistence_handler" + +module ModuleThatIncludesPseudoMachineFetcher + extend OpencBot + extend OpencBot::Helpers::PseudoMachineFetcher +end + +describe OpencBot::Helpers::PseudoMachineFetcher do + context "when a module that includes PseudoMachineFetcher" do + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:run) + end + + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:persist) + end + + it "includes register_methods" do + expect(ModuleThatIncludesPseudoMachineFetcher).to respond_to(:fetch_data) + end + end +end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb new file mode 100644 index 0000000..dc7c477 --- /dev/null +++ b/spec/lib/openc_bot/helpers/pseudo_machine_parser_spec.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/pseudo_machine_parser" +# require "openc_bot/helpers/incremental_search" + +module ModuleThatIncludesPseudoMachineParser + extend OpencBot + extend OpencBot::Helpers::PseudoMachineParser +end + +describe OpencBot::Helpers::PseudoMachineParser +context "when a module that includes PseudoMachineParser" do + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineParser).to respond_to(:run) + end + + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineParser).to respond_to(:persist) + end + + it "return's fetcher" do + 
expect(ModuleThatIncludesPseudoMachineParser.input_stream).to eq("fetcher") + end +end diff --git a/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb b/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb new file mode 100644 index 0000000..bf2b8c3 --- /dev/null +++ b/spec/lib/openc_bot/helpers/pseudo_machine_transformer_spec.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +require "spec_helper" +require "openc_bot" +require "openc_bot/helpers/pseudo_machine_transformer" +# require "openc_bot/helpers/incremental_search" + +module ModuleThatIncludesPseudoMachineTransformer + extend OpencBot + extend OpencBot::Helpers::PseudoMachineTransformer +end + +describe OpencBot::Helpers::PseudoMachineTransformer +context "when a module that includes PseudoMachineTransformer" do + it "has #run method" do + expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:run) + end + + it "includes PersistenceHandler methods" do + expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:persist) + end + + it "includes RegisterMethods methods" do + expect(ModuleThatIncludesPseudoMachineTransformer).to respond_to(:validate_datum) + end + + it "return's parser" do + expect(ModuleThatIncludesPseudoMachineTransformer.input_stream).to eq("parser") + end +end diff --git a/spec/lib/openc_bot/helpers/register_methods_spec.rb b/spec/lib/openc_bot/helpers/register_methods_spec.rb index e410078..3687c07 100644 --- a/spec/lib/openc_bot/helpers/register_methods_spec.rb +++ b/spec/lib/openc_bot/helpers/register_methods_spec.rb @@ -1030,13 +1030,15 @@ module ModuleWithNoCustomPrimaryKey describe "in_prohibited_time?" 
do before do - allow(ModuleThatIncludesRegisterMethods).to receive(:allowed_hours).and_return((0..12)) + allow(ModuleThatIncludesRegisterMethods).to receive(:allowed_hours).and_return((0..12).to_a) + allow(ModuleThatIncludesRegisterMethods).to receive(:allowed_weekend_hours).and_return((0..10).to_a + (12..22).to_a) end it "returns true only if current_time_in_zone out of office hours" do times_and_truthiness = { "2014-10-09 04:14:25 +0100" => false, # weekday out of hours - "2014-10-11 15:14:25 +0100" => false, # in weekend + "2014-10-11 11:14:25 +0100" => true, # in weekend, out of ALLOWED weekend hours + "2014-10-11 15:14:25 +0100" => false, # in weekend, within ALLOWED weekend hours "2014-10-10 15:14:25 +0100" => true, # weekday in business hours } times_and_truthiness.each do |datetime, truthiness| @@ -1045,7 +1047,7 @@ module ModuleWithNoCustomPrimaryKey end end - it "returns false if allowed_hours not defined" do + it "returns nil if allowed_hours not defined" do expect(ModuleWithNoCustomPrimaryKey.in_prohibited_time?).to be_nil end end diff --git a/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb new file mode 100644 index 0000000..894675f --- /dev/null +++ b/spec/lib/pseudo_machine_company_fetcher_bot_spec.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +require_relative "../spec_helper" +require "openc_bot" +require "openc_bot/pseudo_machine_company_fetcher_bot" + +Mail.defaults do + delivery_method :test # no, don't send emails when testing +end + +module TestPseudoMachineCompaniesFetcher + extend OpencBot::PseudoMachineCompanyFetcherBot +end + +module OpencBot + describe PseudoMachineCompanyFetcherBot do + context "when a module extends PseudoMachineCompanyFetcherBot" do + it "includes CompanyFetcherBot methods" do + expect(TestPseudoMachineCompaniesFetcher).to respond_to(:inferred_jurisdiction_code) + end + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 
2e346a2..8cfef50 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -8,6 +8,7 @@ WebMock.disable_net_connect!(allow_localhost: true) RSpec.configure do |config| + # noop end def remove_test_database