From f3ec54cc694413e7b4875c54c5a2351babf344cf Mon Sep 17 00:00:00 2001 From: Sergey Pustovalov Date: Fri, 5 Apr 2019 13:10:26 +0300 Subject: [PATCH] Optimize json import --- .ruby-version | 2 +- Gemfile | 9 +- Gemfile.lock | 21 +++- app/models/bus_service.rb | 6 ++ bin/test_benchmark.rb | 23 ++++ case-study.md | 100 ++++++++++++++++++ config/routes.rb | 3 + ...0190403191542_create_pghero_space_stats.rb | 13 +++ db/schema.rb | 11 +- lib/tasks/utils.rake | 94 ++++++++++++---- 10 files changed, 258 insertions(+), 24 deletions(-) create mode 100644 app/models/bus_service.rb create mode 100755 bin/test_benchmark.rb create mode 100644 case-study.md create mode 100644 db/migrate/20190403191542_create_pghero_space_stats.rb diff --git a/.ruby-version b/.ruby-version index 6a6a3d8..097a15a 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -2.6.1 +2.6.2 diff --git a/Gemfile b/Gemfile index 33017fd..9862e95 100644 --- a/Gemfile +++ b/Gemfile @@ -1,16 +1,23 @@ source 'https://rubygems.org' git_source(:github) { |repo| "https://github.com/#{repo}.git" } -ruby '2.6.1' +ruby '2.6.2' gem 'rails', '~> 5.2.3' gem 'pg', '>= 0.18', '< 2.0' gem 'puma', '~> 3.11' gem 'bootsnap', '>= 1.1.0', require: false +gem 'oj' +gem 'bulk_insert' +gem 'pry' group :development, :test do # Call 'byebug' anywhere in the code to stop execution and get a debugger console gem 'byebug', platforms: [:mri, :mingw, :x64_mingw] + gem 'benchmark-ips' + gem 'ruby-prof' + gem 'pghero' + gem 'pg_query', '>= 0.9.0' end group :development do diff --git a/Gemfile.lock b/Gemfile.lock index eb22e16..c050851 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -43,11 +43,15 @@ GEM minitest (~> 5.1) tzinfo (~> 1.1) arel (9.0.0) + benchmark-ips (2.7.2) bindex (0.6.0) bootsnap (1.4.2) msgpack (~> 1.0) builder (3.2.3) + bulk_insert (1.7.0) + activerecord (>= 3.2.0) byebug (11.0.1) + coderay (1.1.2) concurrent-ruby (1.1.5) crass (1.0.4) erubi (1.8.0) @@ -76,7 +80,14 @@ GEM nio4r (2.3.1) nokogiri (1.10.2) mini_portile2 (~> 
2.4.0) + oj (3.7.11) pg (1.1.4) + pg_query (1.1.0) + pghero (2.2.0) + activerecord + pry (0.12.2) + coderay (~> 1.1.0) + method_source (~> 0.9.0) puma (3.12.1) rack (2.0.6) rack-test (1.1.0) @@ -109,6 +120,7 @@ GEM rb-fsevent (0.10.3) rb-inotify (0.10.0) ffi (~> 1.0) + ruby-prof (0.17.0) ruby_dep (1.5.0) sprockets (3.7.2) concurrent-ruby (~> 1.0) @@ -134,17 +146,24 @@ PLATFORMS ruby DEPENDENCIES + benchmark-ips bootsnap (>= 1.1.0) + bulk_insert byebug listen (>= 3.0.5, < 3.2) + oj pg (>= 0.18, < 2.0) + pg_query (>= 0.9.0) + pghero + pry puma (~> 3.11) rails (~> 5.2.3) + ruby-prof tzinfo-data web-console (>= 3.3.0) RUBY VERSION - ruby 2.6.1p33 + ruby 2.6.2p47 BUNDLED WITH 2.0.1 diff --git a/app/models/bus_service.rb b/app/models/bus_service.rb new file mode 100644 index 0000000..c842d9f --- /dev/null +++ b/app/models/bus_service.rb @@ -0,0 +1,6 @@ +class BusService < ApplicationRecord + self.table_name = 'buses_services' + + belongs_to :bus + belongs_to :service +end diff --git a/bin/test_benchmark.rb b/bin/test_benchmark.rb new file mode 100755 index 0000000..1b3ef7b --- /dev/null +++ b/bin/test_benchmark.rb @@ -0,0 +1,23 @@ +#!/usr/bin/env ruby +# +require 'benchmark' +# require 'fileutils' + +FILES = %w( + small.json + medium.json + large.json +).freeze + +APP_ROOT = File.expand_path('..', __dir__) + +# FileUtils.chdir APP_ROOT do +FILES.each do |fname| + result = Benchmark.measure do + puts "----------Load data from #{fname}----------" + `rake reload_json[#{APP_ROOT}/fixtures/#{fname}]` + end + puts result +end +# end + diff --git a/case-study.md b/case-study.md new file mode 100644 index 0000000..ad60c45 --- /dev/null +++ b/case-study.md @@ -0,0 +1,100 @@ +При изучении utils.rake первым делом в глаза бросается обилие find_or_create_by + +Решил посмотреть с помощью Benchmark.ips как они выполняются последовательно +``` +ActiveRecord::Base.transaction do + City.delete_all + Bus.delete_all + Service.delete_all + Trip.delete_all + 
ActiveRecord::Base.connection.execute('delete from buses_services;') + + Benchmark.ips do |x| + x.report('find_or_create_by City') do + json.each { |trip| City.find_or_create_by(name: trip['from']) } + end + + x.report('Find find_or_create_by services') do + json.each do |trip| + trip['bus']['services'].each do |service| + Service.find_or_create_by(name: service) + end + end + end + + x.report('find_or_create_by Bus') do + json.each { |trip| Bus.find_or_create_by(number: trip['bus']['number']) } + end + end +end +``` + +Получается так +``` +Warming up -------------------------------------- +find_or_create_by City + 1.000 i/100ms +Find find_or_create_by services + 1.000 i/100ms +find_or_create_by Bus + 1.000 i/100ms +Calculating ------------------------------------- +find_or_create_by City + 2.134 (± 0.0%) i/s - 11.000 in 5.167181s +Find find_or_create_by services + 0.491 (± 0.0%) i/s - 3.000 in 6.109831s +find_or_create_by Bus + 0.497 (± 0.0%) i/s - 3.000 in 6.090513s +``` + +при этом общее время импорта small.json +11.723580999998376 + +Окей, пробуем добавить индексы +Показатель времени стал хуже +13.188413999974728 + +Показатели бенчмарка не "взлетели в небеса" +``` +Warming up -------------------------------------- +find_or_create_by City + 1.000 i/100ms +Find find_or_create_by services + 1.000 i/100ms +find_or_create_by Bus + 1.000 i/100ms +Calculating ------------------------------------- +find_or_create_by City + 2.102 (± 0.0%) i/s - 11.000 in 5.260946s +Find find_or_create_by services + 0.510 (± 0.0%) i/s - 3.000 in 5.885058s +find_or_create_by Bus + 0.441 (± 0.0%) i/s - 3.000 in 6.919219s +``` + +Окей, индексы не выход (для импорта данных так уж точно) + +Другой день, импорт small.json занимает 18-20 секунд +пробую использовать gem `oj` + +13.56506800011266 s + +хмм, после серии тестов среднее время 13-15 секунд. 
Неплохо, оставляем + +Замена AR методов `delete_all` на raw sql дает еще небольшой выигрыш в пару секунд + +Окей, ставим и настраиваем pghero + +10 000 элементов в small.json генерируют 4,229 запросов `SELECT FROM services` и всего 10 `INSERT INTO services` + +Время работы сейчас важнее потребляемой памяти, попробую не делать на каждую строку find_or_create, а сделать массив и вставить с помощью bulk_insert + +Стало намного лучше +``` +----------Load data from small.json---------- + 0.000284 0.001181 1.671884 ( 2.216141) +----------Load data from medium.json---------- + 0.000137 0.000838 2.695500 ( 2.925685) +----------Load data from large.json---------- + 0.000154 0.000889 12.920520 ( 14.627362) +``` diff --git a/config/routes.rb b/config/routes.rb index a2da6a7..1920c7c 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -1,5 +1,8 @@ Rails.application.routes.draw do # For details on the DSL available within this file, see http://guides.rubyonrails.org/routing.html + + mount PgHero::Engine, at: "pghero" + get "/" => "statistics#index" get "автобусы/:from/:to" => "trips#index" end diff --git a/db/migrate/20190403191542_create_pghero_space_stats.rb b/db/migrate/20190403191542_create_pghero_space_stats.rb new file mode 100644 index 0000000..6198a6f --- /dev/null +++ b/db/migrate/20190403191542_create_pghero_space_stats.rb @@ -0,0 +1,13 @@ +class CreatePgheroSpaceStats < ActiveRecord::Migration[5.2] + def change + create_table :pghero_space_stats do |t| + t.text :database + t.text :schema + t.text :relation + t.integer :size, limit: 8 + t.timestamp :captured_at + end + + add_index :pghero_space_stats, [:database, :captured_at] + end +end diff --git a/db/schema.rb b/db/schema.rb index f6921e4..7730d58 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema.define(version: 2019_03_30_193044) do +ActiveRecord::Schema.define(version: 2019_04_03_191542) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -29,6 +29,15 @@ t.string "name" end + create_table "pghero_space_stats", force: :cascade do |t| + t.text "database" + t.text "schema" + t.text "relation" + t.bigint "size" + t.datetime "captured_at" + t.index ["database", "captured_at"], name: "index_pghero_space_stats_on_database_and_captured_at" + end + create_table "services", force: :cascade do |t| t.string "name" end diff --git a/lib/tasks/utils.rake b/lib/tasks/utils.rake index 540fe87..d46fe3b 100644 --- a/lib/tasks/utils.rake +++ b/lib/tasks/utils.rake @@ -1,34 +1,88 @@ # Наивная загрузка данных из json-файла в БД # rake reload_json[fixtures/small.json] + task :reload_json, [:file_name] => :environment do |_task, args| json = JSON.parse(File.read(args.file_name)) ActiveRecord::Base.transaction do - City.delete_all - Bus.delete_all - Service.delete_all - Trip.delete_all - ActiveRecord::Base.connection.execute('delete from buses_services;') + + ActiveRecord::Base.connection.execute <<-SQL + delete from cities; + delete from buses; + delete from services; + delete from trips; + delete from buses_services; + SQL + + cities = Set.new + services = Set.new + buses = Set.new + buses_services = Set.new + trips = [] json.each do |trip| - from = City.find_or_create_by(name: trip['from']) - to = City.find_or_create_by(name: trip['to']) - services = [] - trip['bus']['services'].each do |service| - s = Service.find_or_create_by(name: service) - services << s + cities << { name: trip['from'] } + cities << { name: trip['to'] } + buses << { number: trip['bus']['number'], model: trip['bus']['model'] } + trip['bus']['services'].each do |service_name| + services << { name: service_name } + buses_services << { bus_number: trip['bus']['number'], service_name: service_name } end - bus = 
Bus.find_or_create_by(number: trip['bus']['number']) - bus.update(model: trip['bus']['model'], services: services) - - Trip.create!( - from: from, - to: to, - bus: bus, + trips << { + from_name: trip['from'], + to_name: trip['to'], + bus_number: trip['bus']['number'], start_time: trip['start_time'], duration_minutes: trip['duration_minutes'], - price_cents: trip['price_cents'], - ) + price_cents: trip['price_cents'] + } + end + + City.bulk_insert do |worker| + cities.each do |city_attrs| + worker.add(city_attrs) + end + end + + Service.bulk_insert do |worker| + services.each do |service_attrs| + worker.add(service_attrs) + end + end + + Bus.bulk_insert do |worker| + buses.each do |bus_attrs| + worker.add(bus_attrs) + end + end + + cities_objects = City.pluck(:name, :id).to_h + services_objects = Service.all.index_by(&:name) + buses_objects = Bus.all.index_by(&:number) + + BusService.bulk_insert do |worker| + buses_services.each do |bs| + bus_id = buses_objects[bs[:bus_number]].id + service_id = services_objects[bs[:service_name]].id + worker.add(bus_id: bus_id, service_id: service_id) + end + end + + Trip.bulk_insert do |worker| + trips.each do |trip| + from_id = cities_objects[trip[:from_name]] + to_id = cities_objects[trip[:to_name]] + bus_id = buses_objects[trip[:bus_number]].id + + worker.add( + from_id: from_id, + to_id: to_id, + bus_id: bus_id, + start_time: trip[:start_time], + duration_minutes: trip[:duration_minutes], + price_cents: trip[:price_cents] + ) + end end end end