Skip to content

Commit a92a617

Browse files
committed
Implement multi-language mapping
1 parent 28e96e4 commit a92a617

File tree

3 files changed

+219
-64
lines changed

3 files changed

+219
-64
lines changed

lib/interscript.rb

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
# Transliteration
66
module Interscript
7-
87
class InvalidSystemError < StandardError; end
98
class ExternalProcessNotRecognizedError < StandardError; end
109
class ExternalProcessUnavailableError < StandardError; end
@@ -19,13 +18,13 @@ class ExternalProcessUnavailableError < StandardError; end
1918

2019
class << self
2120

22-
def transliterate(system_code, string, maps={})
21+
def transliterate(system_code, string, maps={}, options={})
2322
system_code = map_resolve(system_code)
2423

2524
unless maps.has_key? system_code
2625
maps[system_code] = Interscript::Mapping.for(system_code)
2726
end
28-
# mapping = Interscript::Mapping.for(system_code)
27+
2928
mapping = maps[system_code]
3029

3130
# First, apply chained transliteration as specified in the list `chain`
@@ -43,6 +42,7 @@ def transliterate(system_code, string, maps={})
4342
charmap = mapping.characters_hash
4443
dictmap = mapping.dictionary_hash
4544
trie = mapping.dictionary_trie
45+
language = options[:language] || mapping.language
4646

4747
string = external_processing(mapping, string)
4848

@@ -70,19 +70,8 @@ def transliterate(system_code, string, maps={})
7070
output = string.clone
7171
offsets = Array.new string.to_s.size, 1
7272

73-
# mapping.rules.each do |r|
74-
# string.to_s.scan(/#{r['pattern']}/) do |matches|
75-
# match = Regexp.last_match
76-
# pos = match.offset(0).first
77-
# result = r['result'].clone
78-
# matches.each.with_index { |v, i| result.sub!(/\\#{i + 1}/, v) } if matches.is_a? Array
79-
# result.upcase! if up_case_around?(string, pos)
80-
# output[offsets[0...pos].sum, match[0].size] = result
81-
# offsets[pos] += result.size - match[0].size
82-
# end
83-
# end
84-
8573
mapping.rules.each do |r|
74+
next unless r["language"].nil? || r["language"].include?(language)
8675
next unless output
8776
re = mkregexp(r["pattern"])
8877
output = output.gsub(re, r["result"])

maps/icao-mul-Cyrl-Latn-2015.yaml

Lines changed: 214 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
authority_id: icao
33
id: 9303
44
language: iso-639-2:mul
5+
supported_languages: [iso-639-2:rus, iso-639-2:bel, iso-639-2:ukr, iso-639-2:mkd, iso-639-2:srb ]
56
source_script: Cyrl
67
destination_script: Latn
78
name: "Doc 9303: Machine Readable Travel Documents, Part 3: Specifications Common to all MRTDs, Seventh Edition, 2015"
@@ -20,9 +21,171 @@ description: |
2021
2122
This document defines the transliteration mappings used to produce
2223
this transcription or transliteration.
24+
2325
tests:
26+
- source: Бабрыковіч Аляксандр
27+
expected: Babrykovich Aliaksandr
28+
language: iso-639-2:bel
29+
- source: Міховіч Марыя
30+
expected: Mikhovich Maryia
31+
language: iso-639-2:bel
32+
- source: Максім
33+
expected: Maksim
34+
language: iso-639-2:bel
35+
- source: Іван
36+
expected: Ivan
37+
language: iso-639-2:bel
38+
- source: СВЯТЛАНА
39+
expected: SVIATLANA
40+
language: iso-639-2:bel
41+
- source: Ігар
42+
expected: Ihar
43+
language: iso-639-2:bel
44+
- source: Палто Алена
45+
expected: Palto Alena
46+
language: iso-639-2:bel
47+
- source: Мікалай
48+
expected: Mikalai
49+
language: iso-639-2:bel
50+
# https://en.wikipedia.org/wiki/Machine-readable_passport#Names
51+
- source: Горбачёв
52+
expected: Gorbachev
53+
language: iso-639-2:rus
54+
- source: Горбачёв
55+
expected: Horbachiov
56+
language: iso-639-2:bel
57+
- source: Алексей
58+
expected: Aleksei
59+
language: iso-639-2:rus
60+
- source: Академика Королёва
61+
expected: Akademika Koroleva
62+
language: iso-639-2:rus
63+
- source: улица Бирюлёвская
64+
expected: ulitsa Biriulevskaia
65+
language: iso-639-2:rus
66+
- source: Врубеля Улица
67+
expected: Vrubelia Ulitsa
68+
language: iso-639-2:rus
69+
- source: Люблинская
70+
expected: Liublinskaia
71+
language: iso-639-2:rus
72+
# https://news.tut.by/society/650761.html
73+
- source: Мария Рудь
74+
expected: Mariia Rud
75+
language: iso-639-2:rus
76+
- source: Мария Рудь
77+
expected: Mariia Rud
78+
language: iso-639-2:bel
79+
# https://pasport.org.ua/ru/vazhno/transliteratsiya
80+
- source: Олександр
81+
expected: Oleksandr
82+
language: iso-639-2:ukr
2483

2584
map:
85+
rules:
86+
- pattern: \u0401
87+
result: IO
88+
language: [ iso-639-2:bel ]
89+
- pattern: (?<!\b\u2019)\b\u0404
90+
result: YE
91+
language: [ iso-639-2:ukr ]
92+
- pattern: (?<!\b\u2019)\b\u0407
93+
result: YI
94+
language: [ iso-639-2:ukr ]
95+
- pattern: \u040C
96+
result: KJ
97+
language: [ iso-639-2:mkd ]
98+
- pattern: \u040F
99+
result: DJ
100+
language: [ iso-639-2:mkd ]
101+
- pattern: \u0413
102+
result: H
103+
language: [ iso-639-2:bel, iso-639-2:srb, iso-639-2:ukr ]
104+
- pattern: \u0416
105+
result: Z
106+
language: [ iso-639-2:srb ]
107+
- pattern: \u0418
108+
result: Y
109+
language: [ iso-639-2:ukr ]
110+
- pattern: (?<!\b\u2019)\b\u0419
111+
result: Y
112+
language: [ iso-639-2:ukr ]
113+
- pattern: \u0425
114+
result: H
115+
language: [ iso-639-2:srb, iso-639-2:mkd ]
116+
- pattern: \u0426
117+
result: C
118+
language: [ iso-639-2:srb, iso-639-2:mkd ]
119+
- pattern: \u0427
120+
result: C
121+
language: [ iso-639-2:srb ]
122+
- pattern: \u0428
123+
result: S
124+
language: [ iso-639-2:srb ]
125+
- pattern: \u0429
126+
result: SHT
127+
language: [ iso-639-2:bul ]
128+
- pattern: (?<!\b\u2019)\b\u042E
129+
result: YU
130+
language: [ iso-639-2:ukr ]
131+
- pattern: (?<!\b\u2019)\b\u042F
132+
result: YA
133+
language: [ iso-639-2:ukr ]
134+
- pattern: \u0492
135+
result: GJ
136+
language: [ iso-639-2:mkd ]
137+
- pattern: \u0451
138+
result: io
139+
language: [ iso-639-2:bel ]
140+
- pattern: (?<!\b\u2019)\b\u0454
141+
result: ye
142+
language: [ iso-639-2:ukr ]
143+
- pattern: (?<!\b\u2019)\b\u0457
144+
result: yi
145+
language: [ iso-639-2:ukr ]
146+
- pattern: \u045C
147+
result: kj
148+
language: [ iso-639-2:mkd ]
149+
- pattern: \u045F
150+
result: dj
151+
language: [ iso-639-2:mkd ]
152+
- pattern: \u0433
153+
result: h
154+
language: [ iso-639-2:bel, iso-639-2:srb, iso-639-2:ukr ]
155+
- pattern: \u0436
156+
result: z
157+
language: [ iso-639-2:srb ]
158+
- pattern: \u0438
159+
result: y
160+
language: [ iso-639-2:ukr ]
161+
- pattern: (?<!\b\u2019)\b\u0439
162+
result: y
163+
language: [ iso-639-2:ukr ]
164+
- pattern: \u0445
165+
result: h
166+
language: [ iso-639-2:srb, iso-639-2:mkd ]
167+
- pattern: \u0446
168+
result: c
169+
language: [ iso-639-2:srb, iso-639-2:mkd ]
170+
- pattern: \u0447
171+
result: c
172+
language: [ iso-639-2:srb ]
173+
- pattern: \u0448
174+
result: s
175+
language: [ iso-639-2:srb ]
176+
- pattern: \u0449
177+
result: sht
178+
language: [ iso-639-2:bul ]
179+
- pattern: \u044E
180+
result: yu
181+
language: [ iso-639-2:ukr ]
182+
- pattern: \u044F
183+
result: ya
184+
language: [ iso-639-2:ukr ]
185+
- pattern: \u0493
186+
result: gj
187+
language: [ iso-639-2:mkd ]
188+
26189
characters:
27190
# A. Transliteration of Multinational Latin-based Characters
28191
"\u00C0": "A" # À
@@ -81,7 +244,6 @@ map:
81244
"\u012C": "I" # Ĭ
82245
"\u012E": "I" # Į
83246
"\u0130": "I" # İ
84-
"\u0049": "I" # I
85247
"\u0132": "IJ" # IJ
86248
"\u0134": "J" # Ĵ
87249
"\u0136": "K" # Ķ
@@ -173,52 +335,51 @@ map:
173335
"\u0125": "h" # ĥ
174336
"\u0127": "h" # ħ
175337
"\u0129": "i" # ĩ
176-
"\u012B": "I" # ī
177-
"\u012D": "I" # ĭ
178-
"\u012F": "I" # į
179-
"\u0069": "I" #
180-
"\u0131": "I" # i
181-
"\u0133": "IJ" # ij
182-
"\u0135": "J" # ĵ
183-
"\u0137": "K" # ķ
184-
"\u013A": "L" # ĺ
185-
"\u013C": "L" # ļ
186-
"\u013E": "L" # ľ
187-
"\u0140": "L" # ŀ
188-
"\u0142": "L" # ł
189-
"\u0144": "N" # ń
190-
"\u0146": "N" # ņ
191-
"\u0148": "N" # ň
192-
"\u014B": "N" # ŋ
193-
"\u014D": "O" # ō
194-
"\u014F": "O" # ŏ
195-
"\u0151": "O" # ő
196-
"\u0153": "OE" # œ
197-
"\u0155": "R" # ŕ
198-
"\u0157": "R" # ŗ
199-
"\u0159": "R" # ř
200-
"\u015B": "S" # ś
201-
"\u015D": "S" # ŝ
202-
"\u015F": "S" # ş
203-
"\u0161": "S" # š
204-
"\u0163": "T" # ţ
205-
"\u0165": "T" # ť
206-
"\u0167": "T" # ŧ
207-
"\u0169": "U" # ũ
208-
"\u016B": "U" # ū
209-
"\u016D": "U" # ŭ
210-
"\u016F": "U" # ů
211-
"\u0171": "U" # ű
212-
"\u0173": "U" # ų
213-
"\u0175": "W" # ŵ
214-
"\u0177": "Y" # ŷ
215-
"\u00FF": "Y" # ÿ
216-
"\u017A": "Z" # ź
217-
"\u017C": "Z" # ż
218-
"\u017E": "Z" # ž
338+
"\u012B": "i" # ī
339+
"\u012D": "i" # ĭ
340+
"\u012F": "i" # į
341+
"\u0131": "i" # i
342+
"\u0133": "ij" # ij
343+
"\u0135": "j" # ĵ
344+
"\u0137": "k" # ķ
345+
"\u013A": "l" # ĺ
346+
"\u013C": "l" # ļ
347+
"\u013E": "l" # ľ
348+
"\u0140": "l" # ŀ
349+
"\u0142": "l" # ł
350+
"\u0144": "n" # ń
351+
"\u0146": "n" # ņ
352+
"\u0148": "n" # ň
353+
"\u014B": "n" # ŋ
354+
"\u014D": "o" # ō
355+
"\u014F": "o" # ŏ
356+
"\u0151": "o" # ő
357+
"\u0153": "oe" # œ
358+
"\u0155": "r" # ŕ
359+
"\u0157": "r" # ŗ
360+
"\u0159": "r" # ř
361+
"\u015B": "s" # ś
362+
"\u015D": "s" # ŝ
363+
"\u015F": "s" # ş
364+
"\u0161": "s" # š
365+
"\u0163": "t" # ţ
366+
"\u0165": "t" # ť
367+
"\u0167": "t" # ŧ
368+
"\u0169": "u" # ũ
369+
"\u016B": "u" # ū
370+
"\u016D": "u" # ŭ
371+
"\u016F": "u" # ů
372+
"\u0171": "u" # ű
373+
"\u0173": "u" # ų
374+
"\u0175": "w" # ŵ
375+
"\u0177": "y" # ŷ
376+
"\u00FF": "y" # ÿ
377+
"\u017A": "z" # ź
378+
"\u017C": "z" # ż
379+
"\u017E": "z" # ž
219380

220381
# B. Transliteration of Cyrillic Characters
221-
"\u0401": "E" # Ё (except Belorussian = IO)
382+
"\u0401": "E" # Ё (except Belorussian = IO)
222383
"\u0402": "D" # Ћ
223384
"\u0404": "IE" # Є (except if Ukrainian first character, then = YE)
224385
"\u0405": "DZ" # Ѕ
@@ -229,11 +390,11 @@ map:
229390
"\u040A": "NJ" # Њ
230391
"\u040C": "K" # Ќ (except in the language spoken in the former Yugoslav Republic of Macedonia = KJ)
231392
"\u040E": "U" # ў
232-
"\u040F": "DZ" # Џ (except in the language spoken in the former Yugoslav Republic of Macedonia = DJ)
393+
"\u040F": "DZ" # Џ (except in the language spoken in the former Yugoslav Republic of Macedonia = DJ)
233394
"\u0410": "A" # А
234395
"\u0411": "B" # Б
235396
"\u0412": "V" # В
236-
"\u0413": "G" # Г (except Belorussian, Serbian, and Ukrainian = H)
397+
"\u0413": "G" # Г (except Belorussian, Serbian, and Ukrainian = H)
237398
"\u0414": "D" # Д
238399
"\u0415": "E" # Е
239400
"\u0416": "ZH" # Ж (except Serbian = Z)
@@ -279,7 +440,7 @@ map:
279440
"\u045C": "k" # ќ (except in the language spoken in the former Yugoslav Republic of Macedonia = kj)
280441
"\u045E": "u" # ў
281442
"\u045F": "dz" # џ (except in the language spoken in the former Yugoslav Republic of Macedonia = dj)
282-
"\u0410": "a" # а
443+
"\u0430": "a" # а
283444
"\u0431": "b" # б
284445
"\u0432": "v" # в
285446
"\u0433": "g" # г (except Belorussian, Serbian, and Ukrainian = h)
@@ -315,3 +476,8 @@ map:
315476
"\u0491": "g" # ґ
316477
"\u0493": "g" # ғ (except in the language spoken in the former Yugoslav Republic of Macedonia = gj)
317478
"\u04BB": "c" # һ
479+
480+
# Soft sign transliteration is not defined by the standard, so it is skipped
481+
# https://ru.wikipedia.org/wiki/Транслитерация_русского_алфавита_латиницей#cite_note-tt12-19
482+
"\u042C": "" # Ь
483+
"\u044C": "" # ь

spec/interscript_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
system["tests"]&.uniq&.reduce([]) do |_, test|
2727
it "test for #{test}" do
2828
Timeout::timeout(5) do
29-
result = Interscript.transliterate(system_name, test["source"], cache)
29+
result = Interscript.transliterate(system_name, test["source"], cache, { :language => test["language"] })
3030
expected = test["expected"]&.unicode_normalize
3131
expect(result).to eq(expected)
3232
end

0 commit comments

Comments
 (0)