-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnormalization.go
More file actions
85 lines (78 loc) · 2.24 KB
/
normalization.go
File metadata and controls
85 lines (78 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
package xstrings
import (
"regexp"
"strings"
"github.com/avito-tech/normalize"
)
// addressLowercases is the set of words that NormalizeForAddress and
// NormalizeForName emit entirely in lower case instead of capitalising them.
// Keys are stored lower-cased; callers look words up after strings.ToLower.
// NOTE(review): these look like Portuguese particles ("Rua das Flores",
// "Maria da Silva") — confirm before extending the list to other languages.
var addressLowercases = map[string]bool{
	"da":  true,
	"das": true,
	"de":  true,
	"do":  true,
	"dos": true,
}
// NormalizeForAddress normalizes an address-like string.
//
// s is passed through normalize.Normalize with the local
// withRemoveSpecialChars option followed by WithFixRareCyrillicChars,
// WithCyrillicToLatinLookAlike and WithUmlautToLatinLookAlike (see the
// normalize package for what each option does). The result then has its
// surrounding whitespace trimmed, inner whitespace squeezed, and every
// space-separated word re-cased: one-character words and the words listed in
// addressLowercases become lower case, every other word gets an upper-case
// first character and a lower-case remainder.
func NormalizeForAddress(s string) string {
	v := normalize.Normalize(s, withRemoveSpecialChars(),
		normalize.WithFixRareCyrillicChars(),
		normalize.WithCyrillicToLatinLookAlike(),
		normalize.WithUmlautToLatinLookAlike())
	return titleCaseNormalizedWords(squeezeNormalizedWhitespace(v))
}

// NormalizeForName normalizes a name-like string.
//
// It performs the same steps as NormalizeForAddress except that
// withRemoveSpecialChars is not applied, so characters that option would
// strip are kept.
func NormalizeForName(s string) string {
	v := normalize.Normalize(s,
		normalize.WithFixRareCyrillicChars(),
		normalize.WithCyrillicToLatinLookAlike(),
		normalize.WithUmlautToLatinLookAlike())
	return titleCaseNormalizedWords(squeezeNormalizedWhitespace(v))
}

// squeezeNormalizedWhitespace removes every match of
// leadClosingWhitespacePattern and replaces every match of
// insideWhitespacePattern with a single space.
func squeezeNormalizedWhitespace(v string) string {
	v = leadClosingWhitespacePattern.ReplaceAllString(v, "")
	return insideWhitespacePattern.ReplaceAllString(v, " ")
}

// titleCaseNormalizedWords re-cases each space-separated word of v and joins
// the words back together with single spaces.
func titleCaseNormalizedWords(v string) string {
	words := strings.Split(v, " ")

	var b strings.Builder
	b.Grow(len(v))
	for i, word := range words {
		if i > 0 {
			b.WriteByte(' ')
		}

		lower := strings.ToLower(word)
		runes := []rune(word)
		// Length is measured in runes, not bytes: a word made of one
		// multi-byte character (e.g. "é" or "я") must be treated like a
		// one-letter ASCII word and stay lower case.
		if len(runes) <= 1 || addressLowercases[lower] {
			b.WriteString(lower)
			continue
		}

		b.WriteString(strings.ToUpper(string(runes[0])))
		b.WriteString(strings.ToLower(string(runes[1:])))
	}
	return b.String()
}
// Patterns used by the normalization helpers in this file. They are compiled
// once at package initialisation.
var (
	// specialCharsPattern matches any single character that is not an ASCII
	// Latin letter, a Cyrillic letter in а-я or ё, a digit, whitespace, or one
	// of ä ã õ é á í ó ñ ö ü. Matching is case-insensitive.
	specialCharsPattern = regexp.MustCompile(`(?i:[^äãõéáíóñöüa-zа-яё0-9\s])`)

	// leadClosingWhitespacePattern matches a run of whitespace (ASCII
	// whitespace or any Unicode space separator) at the very start or the
	// very end of the string.
	leadClosingWhitespacePattern = regexp.MustCompile(`^[\s\p{Zs}]+|[\s\p{Zs}]+$`)

	// insideWhitespacePattern matches every run of one or more whitespace
	// characters. A single character must match too: callers replace matches
	// with " " and then split on " ", so a lone tab, newline or no-break
	// space would otherwise glue two words together.
	insideWhitespacePattern = regexp.MustCompile(`[\s\p{Zs}]+`)
)
// withRemoveSpecialChars returns a normalize.Option that deletes every
// character matched by specialCharsPattern. What survives is whatever that
// pattern excludes: Latin and Cyrillic letters, digits, whitespace and the
// accented letters the pattern lists (German umlauts among them).
func withRemoveSpecialChars() normalize.Option {
	strip := func(in string) string {
		return specialCharsPattern.ReplaceAllString(in, "")
	}
	return strip
}