diff --git a/src/main/java/org/mtransit/commons/CleanUtils.java b/src/main/java/org/mtransit/commons/CleanUtils.java index 683ad3a..ebb0453 100644 --- a/src/main/java/org/mtransit/commons/CleanUtils.java +++ b/src/main/java/org/mtransit/commons/CleanUtils.java @@ -20,6 +20,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import kotlin.text.Regex; + @SuppressWarnings({"unused", "WeakerAccess"}) public final class CleanUtils { @@ -101,17 +103,19 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { return label.trim(); } - private static final String PLACE_CHAR_DE_L = "de l'"; - private static final String PLACE_CHAR_DE_LA = "de la "; - private static final String PLACE_CHAR_D = "d'"; - private static final String PLACE_CHAR_DE = "de "; - private static final String PLACE_CHAR_DES = "des "; - private static final String PLACE_CHAR_DU = "du "; - private static final String PLACE_CHAR_LA = "la "; - private static final String PLACE_CHAR_LE = "le "; - private static final String PLACE_CHAR_LES = "les "; - private static final String PLACE_CHAR_L = "l'"; - + static final String PLACE_CHAR_DE_L = "de l'"; + static final String PLACE_CHAR_DE_LA = "de la "; + static final String PLACE_CHAR_D = "d'"; + static final String PLACE_CHAR_DE = "de "; + static final String PLACE_CHAR_DES = "des "; + static final String PLACE_CHAR_DU = "du "; + static final String PLACE_CHAR_LA = "la "; + static final String PLACE_CHAR_LE = "le "; + static final String PLACE_CHAR_LES = "les "; + static final String PLACE_CHAR_L = "l'"; + + @SuppressWarnings("DeprecatedIsStillUsed") + @Deprecated private static final Pattern[] START_WITH_CHARS = new Pattern[]{ // Pattern.compile("^(" + PLACE_CHAR_DE_L + ")", Pattern.CASE_INSENSITIVE), // Pattern.compile("^(" + PLACE_CHAR_DE_LA + ")", Pattern.CASE_INSENSITIVE), // @@ -125,6 +129,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { Pattern.compile("^(" + PLACE_CHAR_L + ")", Pattern.CASE_INSENSITIVE) // }; + @SuppressWarnings("unused") + @Deprecated public static final Pattern[] SPACE_CHARS = new Pattern[]{ // Pattern.compile("( " + PLACE_CHAR_DE_L + ")", Pattern.CASE_INSENSITIVE), // Pattern.compile("( " + PLACE_CHAR_DE_LA + ")", Pattern.CASE_INSENSITIVE), // @@ -138,6 +144,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { Pattern.compile("( " + PLACE_CHAR_L + ")", Pattern.CASE_INSENSITIVE) // }; + @SuppressWarnings("DeprecatedIsStillUsed") + @Deprecated private static final Pattern[] SLASH_CHARS = new Pattern[]{// Pattern.compile("(/ " + PLACE_CHAR_DE_L + ")", Pattern.CASE_INSENSITIVE), // Pattern.compile("(/ " + PLACE_CHAR_DE_LA + ")", Pattern.CASE_INSENSITIVE), // @@ -151,19 +159,24 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { Pattern.compile("(/ " + PLACE_CHAR_L + ")", Pattern.CASE_INSENSITIVE) // }; - private static final String PLACE_CHAR_ARRONDISSEMENT = "arrondissement "; - private static final String PLACE_CHAR_AV = "av "; - private static final String PLACE_CHAR_AVENUE = "avenue "; - private static final String PLACE_CHAR_BOUL = "boul "; - private static final String PLACE_CHAR_BOULEVARD = "boulevard "; - private static final String PLACE_CHAR_CH = "ch "; - private static final String PLACE_CHAR_CIVIQUE = "civique "; - private static final String PLACE_CHAR_CROISS = "croiss "; - private static final String PLACE_CHAR_QUARTIER = "quartier "; - private static final String PLACE_CHAR_RTE = "rte "; - private static final String PLACE_CHAR_RUE = "rue "; - private static final String PLACE_CHAR_TSSE = "tsse "; - + public static final Regex ALL_CHARS_REGEX = CleanUtilsExtKt.makeALL_CHARS_REGEX(); + public static final String ALL_CHARS_REGEX_REPLACEMENT = CleanUtilsExtKt.makeALL_CHARS_REGEX_REPLACEMENT(); + + static final String PLACE_CHAR_ARRONDISSEMENT = "arrondissement "; + static final String PLACE_CHAR_AV = "av "; + static final String PLACE_CHAR_AVENUE = "avenue "; + static final String PLACE_CHAR_BOUL = "boul "; + static final String PLACE_CHAR_BOULEVARD = "boulevard "; + static final String PLACE_CHAR_CH = "ch "; + static final String PLACE_CHAR_CIVIQUE = "civique "; + static final String PLACE_CHAR_CROISS = "croiss "; + static final String PLACE_CHAR_QUARTIER = "quartier "; + static final String PLACE_CHAR_RTE = "rte "; + static final String PLACE_CHAR_RUE = "rue "; + static final String PLACE_CHAR_TSSE = "tsse "; + + @SuppressWarnings("DeprecatedIsStillUsed") + @Deprecated private static final Pattern[] START_WITH_ST = new Pattern[]{ // Pattern.compile("^(" + PLACE_CHAR_ARRONDISSEMENT + ")", Pattern.CASE_INSENSITIVE), // Pattern.compile("^(" + PLACE_CHAR_AV + ")", Pattern.CASE_INSENSITIVE), // @@ -179,6 +192,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { Pattern.compile("^(" + PLACE_CHAR_TSSE + ")", Pattern.CASE_INSENSITIVE) // }; + @SuppressWarnings("unused") + @Deprecated public static final Pattern[] SPACE_ST = new Pattern[]{ // Pattern.compile("( " + PLACE_CHAR_ARRONDISSEMENT + ")", Pattern.CASE_INSENSITIVE), // Pattern.compile("( " + PLACE_CHAR_AV + ")", Pattern.CASE_INSENSITIVE), // @@ -194,6 +209,8 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { Pattern.compile("( " + PLACE_CHAR_TSSE + ")", Pattern.CASE_INSENSITIVE) // }; + @SuppressWarnings("DeprecatedIsStillUsed") + @Deprecated private static final Pattern[] SLASH_ST = new Pattern[]{ // Pattern.compile("(/ " + PLACE_CHAR_ARRONDISSEMENT + ")", Pattern.CASE_INSENSITIVE), // Pattern.compile("(/ " + PLACE_CHAR_AV + ")", Pattern.CASE_INSENSITIVE), // @@ -209,6 +226,9 @@ public static String cleanLabel(@NotNull Locale locale, @NotNull String label) { Pattern.compile("(/ " + PLACE_CHAR_TSSE + ")", Pattern.CASE_INSENSITIVE) // }; + public static final Regex ALL_ST_REGEX = CleanUtilsExtKt.makeALL_ST_REGEX(); + public static final String ALL_ST_REGEX_REPLACEMENT = CleanUtilsExtKt.makeALL_ST_REGEX_REPLACEMENT(); + @NotNull public static Pattern cleanWord(@NotNull String word) { return cleanWords(word); @@ -355,9 +375,13 @@ public static String cleanLabelFR(@NotNull String label) { label = CLEAN_PARENTHESIS2.matcher(label).replaceAll(CLEAN_PARENTHESIS2_REPLACEMENT); label = SAINT.matcher(label).replaceAll(SAINT_REPLACEMENT); label = removePointsI(label); // after capitalize + //noinspection deprecation label = RegexUtils.replaceAllNN(label.trim(), START_WITH_ST, SPACE); // Constants.EMPTY); // SPACE); + //noinspection deprecation label = RegexUtils.replaceAllNN(label, SLASH_ST, SLASH_SPACE); + //noinspection deprecation label = RegexUtils.replaceAllNN(label.trim(), START_WITH_CHARS, SPACE); // , Constants.EMPTY); // + //noinspection deprecation label = RegexUtils.replaceAllNN(label, SLASH_CHARS, SLASH_SPACE); return cleanLabel(Locale.FRENCH, label); } @@ -595,6 +619,9 @@ private static boolean containsIgnoreCase(@Nullable String string, @NotNull Stri return false; } + public static final Regex ALL_FACE_A_REGEX = CleanUtilsExtKt.makeALL_FACE_A_REGEX(); + public static final String ALL_FACE_A_REGEX_REPLACEMENT = CleanUtilsExtKt.makeALL_FACE_A_REGEX_REPLACEMENT(); + // TODO white-space VS non-word? private static final Pattern FIRST = cleanWords("first"); private static final String FIRST_REPLACEMENT = cleanWordsReplacement("1st"); @@ -960,6 +987,8 @@ public static String cleanStreetTypes(@NotNull String string) { private static final String FR_CA_BOULEVARD_REPLACEMENT = cleanWordsReplacement("Boul"); private static final Pattern FR_CA_CARREFOUR = cleanWordsFR("carrefour"); private static final String FR_CA_CARREFOUR_REPLACEMENT = cleanWordsReplacement("Carref"); + private static final Pattern FR_CA_CARRE = cleanWordsFR("carr[é|e]"); + private static final String FR_CA_CARRE_REPLACEMENT = cleanWordsReplacement("Carr"); private static final Pattern FR_CA_MONTAGNE = cleanWordsFR("montagne"); private static final String FR_CA_MONTAGNE_REPLACEMENT = cleanWordsReplacement("Mgne"); private static final Pattern FR_CA_MONTEE = cleanWordsFR("mont[é|e]e"); @@ -1017,6 +1046,7 @@ public static String cleanStreetTypesFRCA(@NotNull String string) { string = FR_CA_AUTOROUTE.matcher(string).replaceAll(FR_CA_AUTOROUTE_REPLACEMENT); string = FR_CA_BOULEVARD.matcher(string).replaceAll(FR_CA_BOULEVARD_REPLACEMENT); string = FR_CA_CARREFOUR.matcher(string).replaceAll(FR_CA_CARREFOUR_REPLACEMENT); + string = FR_CA_CARRE.matcher(string).replaceAll(FR_CA_CARRE_REPLACEMENT); string = FR_CA_MONTAGNE.matcher(string).replaceAll(FR_CA_MONTAGNE_REPLACEMENT); string = FR_CA_MONTEE.matcher(string).replaceAll(FR_CA_MONTEE_REPLACEMENT); string = FR_CA_PARC_INDUSTRIEL.matcher(string).replaceAll(FR_CA_PARC_INDUSTRIEL_REPLACEMENT); diff --git a/src/main/java/org/mtransit/commons/CleanUtilsExt.kt b/src/main/java/org/mtransit/commons/CleanUtilsExt.kt new file mode 100644 index 0000000..0864d2f --- /dev/null +++ b/src/main/java/org/mtransit/commons/CleanUtilsExt.kt @@ -0,0 +1,87 @@ +@file:Suppress("FunctionName") + +package org.mtransit.commons + +import org.mtransit.commons.CleanUtils.PLACE_CHAR_ARRONDISSEMENT +import org.mtransit.commons.CleanUtils.PLACE_CHAR_AV +import org.mtransit.commons.CleanUtils.PLACE_CHAR_AVENUE +import org.mtransit.commons.CleanUtils.PLACE_CHAR_BOUL +import org.mtransit.commons.CleanUtils.PLACE_CHAR_BOULEVARD +import org.mtransit.commons.CleanUtils.PLACE_CHAR_CH +import org.mtransit.commons.CleanUtils.PLACE_CHAR_CIVIQUE +import org.mtransit.commons.CleanUtils.PLACE_CHAR_CROISS +import org.mtransit.commons.CleanUtils.PLACE_CHAR_D +import org.mtransit.commons.CleanUtils.PLACE_CHAR_DE +import org.mtransit.commons.CleanUtils.PLACE_CHAR_DES +import org.mtransit.commons.CleanUtils.PLACE_CHAR_DE_L +import org.mtransit.commons.CleanUtils.PLACE_CHAR_DE_LA +import org.mtransit.commons.CleanUtils.PLACE_CHAR_DU +import org.mtransit.commons.CleanUtils.PLACE_CHAR_L +import org.mtransit.commons.CleanUtils.PLACE_CHAR_LA +import org.mtransit.commons.CleanUtils.PLACE_CHAR_LE +import org.mtransit.commons.CleanUtils.PLACE_CHAR_LES +import org.mtransit.commons.CleanUtils.PLACE_CHAR_QUARTIER +import org.mtransit.commons.CleanUtils.PLACE_CHAR_RTE +import org.mtransit.commons.CleanUtils.PLACE_CHAR_RUE +import org.mtransit.commons.CleanUtils.PLACE_CHAR_TSSE + +fun makeALL_ST_REGEX() = + buildString { + append("((^\\s*|/\\s*)(") + append( + listOf( + PLACE_CHAR_ARRONDISSEMENT, + PLACE_CHAR_AV, + PLACE_CHAR_AVENUE, + PLACE_CHAR_BOUL, + PLACE_CHAR_BOULEVARD, + PLACE_CHAR_CH, + PLACE_CHAR_CIVIQUE, + PLACE_CHAR_CROISS, + PLACE_CHAR_QUARTIER, + PLACE_CHAR_RTE, + PLACE_CHAR_RUE, + PLACE_CHAR_TSSE, + ).joinToString("|") + ) + append("))") + }.toRegex(setOf(RegexOption.IGNORE_CASE)) + +fun makeALL_ST_REGEX_REPLACEMENT() = "$2" + +fun makeALL_CHARS_REGEX() = + buildString { + append("((^\\s*|/\\s*)(") + append( + listOf( + PLACE_CHAR_DE_L, + PLACE_CHAR_DE_LA, + PLACE_CHAR_D, + PLACE_CHAR_DE, + PLACE_CHAR_DES, + PLACE_CHAR_DU, + PLACE_CHAR_LA, + PLACE_CHAR_LE, + PLACE_CHAR_LES, + PLACE_CHAR_L, + ).joinToString("|") + ) + append("))") + }.toRegex(setOf(RegexOption.IGNORE_CASE)) + +fun makeALL_CHARS_REGEX_REPLACEMENT() = "$2" + +fun makeALL_FACE_A_REGEX() = + buildString { + append("((^|\\s)(") + append( + listOf( + "face à ", + "face au ", + "face ", + ).joinToString("|") + ) + append("))") + }.toRegex(setOf(RegexOption.IGNORE_CASE)) + +fun makeALL_FACE_A_REGEX_REPLACEMENT() = "$2" diff --git a/src/main/java/org/mtransit/commons/StringsCleaner.kt b/src/main/java/org/mtransit/commons/StringsCleaner.kt index b27aac6..ecc07f9 100644 --- a/src/main/java/org/mtransit/commons/StringsCleaner.kt +++ b/src/main/java/org/mtransit/commons/StringsCleaner.kt @@ -115,6 +115,10 @@ object StringsCleaner { string = CleanUtils.CLEAN_ET.matcher(string).replaceAll(CleanUtils.CLEAN_ET_REPLACEMENT) string = CleanUtils.SAINT.matcher(string).replaceAll(CleanUtils.SAINT_REPLACEMENT) string = CleanUtils.cleanStreetTypesFRCA(string) + string = CleanUtils.removePointsI(string) // BEFORE next regexes + string = CleanUtils.ALL_FACE_A_REGEX.replace(string, CleanUtils.ALL_FACE_A_REGEX_REPLACEMENT) + string = CleanUtils.ALL_ST_REGEX.replace(string, CleanUtils.ALL_ST_REGEX_REPLACEMENT) + string = CleanUtils.ALL_CHARS_REGEX.replace(string, CleanUtils.ALL_CHARS_REGEX_REPLACEMENT) } } languages?.forEach { language ->