Skip to content

Commit 58da361

Browse files
authored
String cleaner > FR > improvements (#27)
1 parent 0ecc208 commit 58da361

3 files changed

Lines changed: 145 additions & 24 deletions

File tree

src/main/java/org/mtransit/commons/CleanUtils.java

Lines changed: 54 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import java.util.regex.Matcher;
2121
import java.util.regex.Pattern;
2222

23+
import kotlin.text.Regex;
24+
2325
@SuppressWarnings({"unused", "WeakerAccess"})
2426
public final class CleanUtils {
2527

@@ -101,17 +103,19 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
101103
return label.trim();
102104
}
103105

104-
private static final String PLACE_CHAR_DE_L = "de l'";
105-
private static final String PLACE_CHAR_DE_LA = "de la ";
106-
private static final String PLACE_CHAR_D = "d'";
107-
private static final String PLACE_CHAR_DE = "de ";
108-
private static final String PLACE_CHAR_DES = "des ";
109-
private static final String PLACE_CHAR_DU = "du ";
110-
private static final String PLACE_CHAR_LA = "la ";
111-
private static final String PLACE_CHAR_LE = "le ";
112-
private static final String PLACE_CHAR_LES = "les ";
113-
private static final String PLACE_CHAR_L = "l'";
114-
106+
static final String PLACE_CHAR_DE_L = "de l'";
107+
static final String PLACE_CHAR_DE_LA = "de la ";
108+
static final String PLACE_CHAR_D = "d'";
109+
static final String PLACE_CHAR_DE = "de ";
110+
static final String PLACE_CHAR_DES = "des ";
111+
static final String PLACE_CHAR_DU = "du ";
112+
static final String PLACE_CHAR_LA = "la ";
113+
static final String PLACE_CHAR_LE = "le ";
114+
static final String PLACE_CHAR_LES = "les ";
115+
static final String PLACE_CHAR_L = "l'";
116+
117+
@SuppressWarnings("DeprecatedIsStillUsed")
118+
@Deprecated
115119
private static final Pattern[] START_WITH_CHARS = new Pattern[]{ //
116120
Pattern.compile("^(" + PLACE_CHAR_DE_L + ")", Pattern.CASE_INSENSITIVE), //
117121
Pattern.compile("^(" + PLACE_CHAR_DE_LA + ")", Pattern.CASE_INSENSITIVE), //
@@ -125,6 +129,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
125129
Pattern.compile("^(" + PLACE_CHAR_L + ")", Pattern.CASE_INSENSITIVE) //
126130
};
127131

132+
@SuppressWarnings("unused")
133+
@Deprecated
128134
public static final Pattern[] SPACE_CHARS = new Pattern[]{ //
129135
Pattern.compile("( " + PLACE_CHAR_DE_L + ")", Pattern.CASE_INSENSITIVE), //
130136
Pattern.compile("( " + PLACE_CHAR_DE_LA + ")", Pattern.CASE_INSENSITIVE), //
@@ -138,6 +144,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
138144
Pattern.compile("( " + PLACE_CHAR_L + ")", Pattern.CASE_INSENSITIVE) //
139145
};
140146

147+
@SuppressWarnings("DeprecatedIsStillUsed")
148+
@Deprecated
141149
private static final Pattern[] SLASH_CHARS = new Pattern[]{//
142150
Pattern.compile("(/ " + PLACE_CHAR_DE_L + ")", Pattern.CASE_INSENSITIVE), //
143151
Pattern.compile("(/ " + PLACE_CHAR_DE_LA + ")", Pattern.CASE_INSENSITIVE), //
@@ -151,19 +159,24 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
151159
Pattern.compile("(/ " + PLACE_CHAR_L + ")", Pattern.CASE_INSENSITIVE) //
152160
};
153161

154-
private static final String PLACE_CHAR_ARRONDISSEMENT = "arrondissement ";
155-
private static final String PLACE_CHAR_AV = "av ";
156-
private static final String PLACE_CHAR_AVENUE = "avenue ";
157-
private static final String PLACE_CHAR_BOUL = "boul ";
158-
private static final String PLACE_CHAR_BOULEVARD = "boulevard ";
159-
private static final String PLACE_CHAR_CH = "ch ";
160-
private static final String PLACE_CHAR_CIVIQUE = "civique ";
161-
private static final String PLACE_CHAR_CROISS = "croiss ";
162-
private static final String PLACE_CHAR_QUARTIER = "quartier ";
163-
private static final String PLACE_CHAR_RTE = "rte ";
164-
private static final String PLACE_CHAR_RUE = "rue ";
165-
private static final String PLACE_CHAR_TSSE = "tsse ";
166-
162+
public static final Regex ALL_CHARS_REGEX = CleanUtilsExtKt.makeALL_CHARS_REGEX();
163+
public static final String ALL_CHARS_REGEX_REPLACEMENT = CleanUtilsExtKt.makeALL_CHARS_REGEX_REPLACEMENT();
164+
165+
static final String PLACE_CHAR_ARRONDISSEMENT = "arrondissement ";
166+
static final String PLACE_CHAR_AV = "av ";
167+
static final String PLACE_CHAR_AVENUE = "avenue ";
168+
static final String PLACE_CHAR_BOUL = "boul ";
169+
static final String PLACE_CHAR_BOULEVARD = "boulevard ";
170+
static final String PLACE_CHAR_CH = "ch ";
171+
static final String PLACE_CHAR_CIVIQUE = "civique ";
172+
static final String PLACE_CHAR_CROISS = "croiss ";
173+
static final String PLACE_CHAR_QUARTIER = "quartier ";
174+
static final String PLACE_CHAR_RTE = "rte ";
175+
static final String PLACE_CHAR_RUE = "rue ";
176+
static final String PLACE_CHAR_TSSE = "tsse ";
177+
178+
@SuppressWarnings("DeprecatedIsStillUsed")
179+
@Deprecated
167180
private static final Pattern[] START_WITH_ST = new Pattern[]{ //
168181
Pattern.compile("^(" + PLACE_CHAR_ARRONDISSEMENT + ")", Pattern.CASE_INSENSITIVE), //
169182
Pattern.compile("^(" + PLACE_CHAR_AV + ")", Pattern.CASE_INSENSITIVE), //
@@ -179,6 +192,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
179192
Pattern.compile("^(" + PLACE_CHAR_TSSE + ")", Pattern.CASE_INSENSITIVE) //
180193
};
181194

195+
@SuppressWarnings("unused")
196+
@Deprecated
182197
public static final Pattern[] SPACE_ST = new Pattern[]{ //
183198
Pattern.compile("( " + PLACE_CHAR_ARRONDISSEMENT + ")", Pattern.CASE_INSENSITIVE), //
184199
Pattern.compile("( " + PLACE_CHAR_AV + ")", Pattern.CASE_INSENSITIVE), //
@@ -194,6 +209,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
194209
Pattern.compile("( " + PLACE_CHAR_TSSE + ")", Pattern.CASE_INSENSITIVE) //
195210
};
196211

212+
@SuppressWarnings("DeprecatedIsStillUsed")
213+
@Deprecated
197214
private static final Pattern[] SLASH_ST = new Pattern[]{ //
198215
Pattern.compile("(/ " + PLACE_CHAR_ARRONDISSEMENT + ")", Pattern.CASE_INSENSITIVE), //
199216
Pattern.compile("(/ " + PLACE_CHAR_AV + ")", Pattern.CASE_INSENSITIVE), //
@@ -209,6 +226,9 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) {
209226
Pattern.compile("(/ " + PLACE_CHAR_TSSE + ")", Pattern.CASE_INSENSITIVE) //
210227
};
211228

229+
public static final Regex ALL_ST_REGEX = CleanUtilsExtKt.makeALL_ST_REGEX();
230+
public static final String ALL_ST_REGEX_REPLACEMENT = CleanUtilsExtKt.makeALL_ST_REGEX_REPLACEMENT();
231+
212232
@NotNull
213233
public static Pattern cleanWord(@NotNull String word) {
214234
return cleanWords(word);
@@ -355,9 +375,13 @@ public static String cleanLabelFR(@NotNull String label) {
355375
label = CLEAN_PARENTHESIS2.matcher(label).replaceAll(CLEAN_PARENTHESIS2_REPLACEMENT);
356376
label = SAINT.matcher(label).replaceAll(SAINT_REPLACEMENT);
357377
label = removePointsI(label); // after capitalize
378+
//noinspection deprecation
358379
label = RegexUtils.replaceAllNN(label.trim(), START_WITH_ST, SPACE); // Constants.EMPTY); // SPACE);
380+
//noinspection deprecation
359381
label = RegexUtils.replaceAllNN(label, SLASH_ST, SLASH_SPACE);
382+
//noinspection deprecation
360383
label = RegexUtils.replaceAllNN(label.trim(), START_WITH_CHARS, SPACE); // , Constants.EMPTY); //
384+
//noinspection deprecation
361385
label = RegexUtils.replaceAllNN(label, SLASH_CHARS, SLASH_SPACE);
362386
return cleanLabel(Locale.FRENCH, label);
363387
}
@@ -595,6 +619,9 @@ private static boolean containsIgnoreCase(@Nullable String string, @NotNull Stri
595619
return false;
596620
}
597621

622+
public static final Regex ALL_FACE_A_REGEX = CleanUtilsExtKt.makeALL_FACE_A_REGEX();
623+
public static final String ALL_FACE_A_REGEX_REPLACEMENT = CleanUtilsExtKt.makeALL_FACE_A_REGEX_REPLACEMENT();
624+
598625
// TODO white-space VS non-word?
599626
private static final Pattern FIRST = cleanWords("first");
600627
private static final String FIRST_REPLACEMENT = cleanWordsReplacement("1st");
@@ -960,6 +987,8 @@ public static String cleanStreetTypes(@NotNull String string) {
960987
private static final String FR_CA_BOULEVARD_REPLACEMENT = cleanWordsReplacement("Boul");
961988
private static final Pattern FR_CA_CARREFOUR = cleanWordsFR("carrefour");
962989
private static final String FR_CA_CARREFOUR_REPLACEMENT = cleanWordsReplacement("Carref");
990+
// Matches the street type "carré" with or without the accent ("carre").
// Inside a regex character class '|' is a literal pipe, not an alternation, so the class lists only the two vowels
// (the previous "[é|e]" also accepted "carr|"). NOTE(review): assumes cleanWordsFR embeds its argument as a regex fragment.
private static final Pattern FR_CA_CARRE = cleanWordsFR("carr[ée]");
991+
private static final String FR_CA_CARRE_REPLACEMENT = cleanWordsReplacement("Carr");
963992
private static final Pattern FR_CA_MONTAGNE = cleanWordsFR("montagne");
964993
private static final String FR_CA_MONTAGNE_REPLACEMENT = cleanWordsReplacement("Mgne");
965994
// Matches the street type "montée" with or without the accent ("montee").
// Inside a regex character class '|' is a literal pipe, not an alternation, so the class lists only the two vowels
// (the previous "[é|e]" also accepted "mont|e"). NOTE(review): assumes cleanWordsFR embeds its argument as a regex fragment.
private static final Pattern FR_CA_MONTEE = cleanWordsFR("mont[ée]e");
@@ -1017,6 +1046,7 @@ public static String cleanStreetTypesFRCA(@NotNull String string) {
10171046
string = FR_CA_AUTOROUTE.matcher(string).replaceAll(FR_CA_AUTOROUTE_REPLACEMENT);
10181047
string = FR_CA_BOULEVARD.matcher(string).replaceAll(FR_CA_BOULEVARD_REPLACEMENT);
10191048
string = FR_CA_CARREFOUR.matcher(string).replaceAll(FR_CA_CARREFOUR_REPLACEMENT);
1049+
string = FR_CA_CARRE.matcher(string).replaceAll(FR_CA_CARRE_REPLACEMENT);
10201050
string = FR_CA_MONTAGNE.matcher(string).replaceAll(FR_CA_MONTAGNE_REPLACEMENT);
10211051
string = FR_CA_MONTEE.matcher(string).replaceAll(FR_CA_MONTEE_REPLACEMENT);
10221052
string = FR_CA_PARC_INDUSTRIEL.matcher(string).replaceAll(FR_CA_PARC_INDUSTRIEL_REPLACEMENT);
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
@file:Suppress("FunctionName")
2+
3+
package org.mtransit.commons
4+
5+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_ARRONDISSEMENT
6+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_AV
7+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_AVENUE
8+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_BOUL
9+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_BOULEVARD
10+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_CH
11+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_CIVIQUE
12+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_CROISS
13+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_D
14+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_DE
15+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_DES
16+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_DE_L
17+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_DE_LA
18+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_DU
19+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_L
20+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_LA
21+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_LE
22+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_LES
23+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_QUARTIER
24+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_RTE
25+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_RUE
26+
import org.mtransit.commons.CleanUtils.PLACE_CHAR_TSSE
27+
28+
/**
 * Builds the case-insensitive [Regex] that matches one of the French street-type prefixes
 * (the `PLACE_CHAR_*` street constants of [CleanUtils]) when it sits at the very start of the
 * string or directly after a `/`, optional white-space included.
 *
 * Capture group 2 is the anchor part (start-of-string white-space, or the slash and the
 * white-space following it); capture group 3 is the street-type prefix itself.
 *
 * @return the compiled regex; a new instance is created on every call
 */
fun makeALL_ST_REGEX(): Regex {
    val streetTypePrefixes = arrayOf(
        PLACE_CHAR_ARRONDISSEMENT,
        PLACE_CHAR_AV,
        PLACE_CHAR_AVENUE,
        PLACE_CHAR_BOUL,
        PLACE_CHAR_BOULEVARD,
        PLACE_CHAR_CH,
        PLACE_CHAR_CIVIQUE,
        PLACE_CHAR_CROISS,
        PLACE_CHAR_QUARTIER,
        PLACE_CHAR_RTE,
        PLACE_CHAR_RUE,
        PLACE_CHAR_TSSE,
    )
    // anchor group + alternation of every prefix, all wrapped in one outer group
    val pattern = streetTypePrefixes.joinToString(
        separator = "|",
        prefix = "((^\\s*|/\\s*)(",
        postfix = "))",
    )
    return Regex(pattern, RegexOption.IGNORE_CASE)
}
49+
50+
/**
 * Replacement template paired with [makeALL_ST_REGEX]: it keeps only capture group 2
 * (the start-of-string / slash anchor), which drops the matched street-type prefix.
 *
 * @return the replacement template
 */
fun makeALL_ST_REGEX_REPLACEMENT(): String {
    return "$2"
}
51+
52+
/**
 * Builds the case-insensitive [Regex] that matches one of the French article / preposition
 * prefixes (the `PLACE_CHAR_DE_L` … `PLACE_CHAR_L` constants of [CleanUtils]) when it sits at
 * the very start of the string or directly after a `/`, optional white-space included.
 *
 * Capture group 2 is the anchor part (start-of-string white-space, or the slash and the
 * white-space following it); capture group 3 is the matched prefix itself.
 *
 * @return the compiled regex; a new instance is created on every call
 */
fun makeALL_CHARS_REGEX(): Regex {
    val articlePrefixes = arrayOf(
        PLACE_CHAR_DE_L,
        PLACE_CHAR_DE_LA,
        PLACE_CHAR_D,
        PLACE_CHAR_DE,
        PLACE_CHAR_DES,
        PLACE_CHAR_DU,
        PLACE_CHAR_LA,
        PLACE_CHAR_LE,
        PLACE_CHAR_LES,
        PLACE_CHAR_L,
    )
    // anchor group + alternation of every prefix, all wrapped in one outer group
    val pattern = articlePrefixes.joinToString(
        separator = "|",
        prefix = "((^\\s*|/\\s*)(",
        postfix = "))",
    )
    return Regex(pattern, RegexOption.IGNORE_CASE)
}
71+
72+
/**
 * Replacement template paired with [makeALL_CHARS_REGEX]: it keeps only capture group 2
 * (the start-of-string / slash anchor), which drops the matched article / preposition prefix.
 *
 * @return the replacement template
 */
fun makeALL_CHARS_REGEX_REPLACEMENT(): String {
    return "$2"
}
73+
74+
/**
 * Builds the case-insensitive [Regex] that matches the French "facing" markers
 * (`face à `, `face au `, `face `) when they appear at the start of the string or right after
 * a white-space character.
 *
 * Capture group 2 is the anchor (empty at start-of-string, otherwise the white-space
 * character); capture group 3 is the matched marker itself.
 *
 * @return the compiled regex; a new instance is created on every call
 */
fun makeALL_FACE_A_REGEX(): Regex {
    // longer variants first, bare "face " last
    val facingMarkers = arrayOf(
        "face à ",
        "face au ",
        "face ",
    )
    val pattern = facingMarkers.joinToString(
        separator = "|",
        prefix = "((^|\\s)(",
        postfix = "))",
    )
    return Regex(pattern, RegexOption.IGNORE_CASE)
}
86+
87+
/**
 * Replacement template paired with [makeALL_FACE_A_REGEX]: it keeps only capture group 2
 * (nothing at start-of-string, otherwise the leading white-space character), which drops the
 * matched "face …" marker.
 *
 * @return the replacement template
 */
fun makeALL_FACE_A_REGEX_REPLACEMENT(): String {
    return "$2"
}

src/main/java/org/mtransit/commons/StringsCleaner.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,10 @@ object StringsCleaner {
115115
string = CleanUtils.CLEAN_ET.matcher(string).replaceAll(CleanUtils.CLEAN_ET_REPLACEMENT)
116116
string = CleanUtils.SAINT.matcher(string).replaceAll(CleanUtils.SAINT_REPLACEMENT)
117117
string = CleanUtils.cleanStreetTypesFRCA(string)
118+
string = CleanUtils.removePointsI(string) // BEFORE next regexes
119+
string = CleanUtils.ALL_FACE_A_REGEX.replace(string, CleanUtils.ALL_FACE_A_REGEX_REPLACEMENT)
120+
string = CleanUtils.ALL_ST_REGEX.replace(string, CleanUtils.ALL_ST_REGEX_REPLACEMENT)
121+
string = CleanUtils.ALL_CHARS_REGEX.replace(string, CleanUtils.ALL_CHARS_REGEX_REPLACEMENT)
118122
}
119123
}
120124
languages?.forEach { language ->

0 commit comments

Comments
 (0)