From 199532355a6737494878fad8782a0a4c360aec4d Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Fri, 20 Feb 2026 19:51:31 +0700 Subject: [PATCH] refactor(score): normalize distance map values - Add normalizeMapValues to Arrays utility - Use normalized values for distance scoring - Preserve string keys during normalization fix(client): add missing return statement in word validation --- src/ManticoreSearch/Client.php | 191 ++++++++++++++++----------------- src/Tool/Arrays.php | 52 +++++++-- 2 files changed, 138 insertions(+), 105 deletions(-) diff --git a/src/ManticoreSearch/Client.php b/src/ManticoreSearch/Client.php index 00055e6..d115533 100755 --- a/src/ManticoreSearch/Client.php +++ b/src/ManticoreSearch/Client.php @@ -525,8 +525,8 @@ protected function fetchSettings(): Settings { $settings->push( new Map( [ - 'key' => $key, - 'value' => $value, + 'key' => $key, + 'value' => $value, ] ) ); @@ -547,8 +547,8 @@ protected function fetchSettings(): Settings { $settings->push( new Map( [ - 'key' => $key, - 'value' => $value, + 'key' => $key, + 'value' => $value, ] ) ); @@ -566,7 +566,7 @@ protected function fetchSettings(): Settings { * @param bool $forceBigrams When set to true, passes "1 as force_bigrams" to all CALL SUGGEST requests * @param int $distance Maximum edit distance for suggestions * @param int $limit Maximum number of suggestions per word - * @return array{0: array, 1: array} Words and score map + * @return array{0: array, 1: Map} Words and score map */ public function fetchFuzzyVariations( string $query, @@ -585,13 +585,13 @@ public function fetchFuzzyVariations( /** @var array $words */ $words = []; - /** @var array $distanceMap */ - $distanceMap = []; - /** @var array $docMap */ - $docMap = []; + /** @var Map $distanceMap */ + $distanceMap = new Map(); + /** @var Map $docMap */ + $docMap = new Map(); // 2. For each tokenized word, we get the suggestions from the suggest function - // Track processed tokens to avoid double processing after merges + // Track processed tokens to avoid double processing after merges $processedTokens = []; $i = 0; @@ -621,16 +621,14 @@ public function fetchFuzzyVariations( - // 3. Normalize the distance and docs values - /** @var array $docMapNormalized */ - $docMapNormalized = Arrays::normalizeValues($docMap); - /** @var array $distanceMapNormalized */ - $distanceMapNormalized = Arrays::normalizeValues($distanceMap); - // Discard the original values + // 3. Normalize the distance and docs values + // 3. Normalize the distance and docs values using Map to preserve string keys + $docMapNormalized = Arrays::normalizeMapValues($docMap); + $distanceMapNormalized = Arrays::normalizeMapValues($distanceMap); + // Discard the original values unset($docMap, $distanceMap); - $scoreMap = $this->calculateScoreMap($docMapNormalized, $distanceMapNormalized); - + $scoreMap = $this->calculateScoreMapFromMap($docMapNormalized, $distanceMapNormalized); return [$words, $scoreMap]; } @@ -646,8 +644,8 @@ public function fetchFuzzyVariations( * @param int $i Current word index * @param array $normalized Array of normalized words * @param array $words Reference to words array to be populated - * @param array $distanceMap Reference to distance map to be populated - * @param array $docMap Reference to document map to be populated + * @param Map $distanceMap Reference to distance map to be populated + * @param Map $docMap Reference to document map to be populated * @param array $processedTokens Reference to processed tokens tracking * @param bool $forceBigrams When set to true, passes "1 as force_bigrams" to all CALL SUGGEST requests * @return int Number of tokens consumed (1 for individual, 2+ for merged) @@ -660,22 +658,22 @@ private function processSuggestion( int $i, array $normalized, array &$words, - array &$distanceMap, - array &$docMap, + Map &$distanceMap, + Map &$docMap, array &$processedTokens, bool $forceBigrams ): int { $forceBigramsOption = $forceBigrams ? ', 1 as force_bigrams' : ''; $query = "CALL SUGGEST( - '{$word}', - '{$table}', - {$limit} as limit, - {$distance} as max_edits, - 1 as non_char - {$forceBigramsOption} - )"; + '{$word}', + '{$table}', + {$limit} as limit, + {$distance} as max_edits, + 1 as non_char + {$forceBigramsOption} + )"; - /** + /** * @var arraysendRequest($query)->getResult(); - /** @var array $suggestions */ + /** @var array $suggestions */ $suggestions = $suggestResult[0]['data'] ?? []; $choices = []; foreach ($suggestions as $suggestion) { $suggestWord = $suggestion['suggest']; $choices[] = $suggestWord; - $distanceMap[$suggestWord] = $suggestion['distance']; - $docMap[$suggestWord] = $suggestion['docs']; + $distanceMap->put($suggestWord, $suggestion['distance']); + $docMap->put($suggestWord, $suggestion['docs']); } - // Smart merge logic - try to merge with next word if conditions are met + // Smart merge logic - try to merge with next word if conditions are met $mergeResult = $this->tryMergeWithNext( $word, $i, @@ -717,11 +715,13 @@ private function processSuggestion( $finalChoices = $mergeResult['choices']; // Add merged distance and doc mappings - foreach ($mergeResult['distanceMap'] as $suggWord => $dist) { - $distanceMap[$suggWord] = $dist; + /** @var Map $mergeDistanceMap */ + $mergeDistanceMap = $mergeResult['distanceMap']; + foreach ($mergeDistanceMap as $suggWord => $dist) { + $distanceMap->put($suggWord, $dist); } foreach ($mergeResult['docMap'] as $suggWord => $docs) { - $docMap[$suggWord] = $docs; + $docMap->put($suggWord, $docs); } // Mark next token as processed @@ -731,18 +731,18 @@ private function processSuggestion( $finalChoices = $choices; } - // Special case for empty suggestions + // Special case for empty suggestions if (!$finalChoices) { // FIX: Preserve the original word when no suggestions found // This prevents single-letter words from disappearing during combination building $finalChoices = [$finalOriginal]; - $distanceMap[$finalOriginal] = 999; - $docMap[$finalOriginal] = 0; + $distanceMap->put($finalOriginal, 999); + $docMap->put($finalOriginal, 0); } $words[] = [ - 'original' => $finalOriginal, - 'keywords' => $finalChoices, + 'original' => $finalOriginal, + 'keywords' => $finalChoices, ]; return $tokenCount; @@ -788,9 +788,8 @@ private function shouldAttemptMerge(string $word, string $nextWord): bool { * @param int $limit * @param int $distance * @param array $choices - * @param array $distanceMap - * @param bool $forceBigrams When set to true, passes "1 as force_bigrams" to all CALL SUGGEST requests - * @return array{original:string,choices:array,distanceMap:array,docMap:array}|null + * @param Map $distanceMap + * @return array{original:string,choices:array,distanceMap:Map,docMap:array}|null */ private function tryMergeWithNext( string $word, @@ -800,7 +799,7 @@ private function tryMergeWithNext( int $limit, int $distance, array $choices, - array $distanceMap, + Map $distanceMap, bool $forceBigrams ): ?array { if (!isset($normalized[$i + 1]) || !$this->shouldAttemptMerge($word, $normalized[$i + 1])) { @@ -812,16 +811,16 @@ private function tryMergeWithNext( $forceBigramsOption = $forceBigrams ? ', 1 as force_bigrams' : ''; $query = "CALL SUGGEST( - '{$combinedWord}', - '{$table}', + '{$combinedWord}', + '{$table}', {$limit} as limit, {$distance} as max_edits, 1 as non_char {$forceBigramsOption} - )"; + )"; - /** @var array}> $combinedSuggestResult */ + /** @var array}> $combinedSuggestResult */ $combinedSuggestResult = $this->sendRequest($query)->getResult(); $combinedSuggestions = $combinedSuggestResult[0]['data'] ?? []; @@ -832,34 +831,34 @@ private function tryMergeWithNext( $mergedChoices = []; - $mergedDistanceMap = []; + $mergedDistanceMap = new Map(); $mergedDocMap = []; foreach ($combinedSuggestions as $suggestion) { $combinedSuggest = $suggestion['suggest']; $mergedChoices[] = $combinedSuggest; // We add 1 here cuz we already merge with space, so the distance is the same - $mergedDistanceMap[$combinedSuggest] = $suggestion['distance'] + 1; + $mergedDistanceMap->put($combinedSuggest, $suggestion['distance'] + 1); $mergedDocMap[$combinedSuggest] = $suggestion['docs']; } - // SMART CHECK: Get suggestions for next word alone to compare - // If merged suggestions are same as next word alone, merge is useless + // SMART CHECK: Get suggestions for next word alone to compare + // If merged suggestions are same as next word alone, merge is useless $nextWordQuery = "CALL SUGGEST( - '{$nextWord}', - '{$table}', - {$limit} as limit, - {$distance} as max_edits, - 1 as non_char - {$forceBigramsOption} - )"; - - /** @var array}> $nextWordResult */ + '{$nextWord}', + '{$table}', + {$limit} as limit, + {$distance} as max_edits, + 1 as non_char + {$forceBigramsOption} + )"; + + /** @var array}> $nextWordResult */ $nextWordResult = $this->sendRequest($nextWordQuery)->getResult(); $nextWordSuggestions = $nextWordResult[0]['data'] ?? []; $nextWordChoices = array_column($nextWordSuggestions, 'suggest'); - // Compare merged vs individual quality + // Compare merged vs individual quality if (!$this->shouldUseMergedResult( $choices, $mergedChoices, @@ -873,32 +872,32 @@ private function tryMergeWithNext( } return [ - 'original' => $combinedWord, - 'choices' => $mergedChoices, + 'original' => $combinedWord, + 'choices' => $mergedChoices, 'distanceMap' => $mergedDistanceMap, 'docMap' => $mergedDocMap, ]; } -/** - * Compare merged vs individual results quality - * SMART LOGIC: Check if merge actually provides value or just matches the next word - * - * @param array $individualChoices Suggestions for first word - * @param array $mergedChoices Suggestions for merged word - * @param array $individualDistanceMap Distance map for first word - * @param array $mergedDistanceMap Distance map for merged word - * @param array $nextWordChoices Suggestions for second word alone - * @param string $word First word - * @param string $nextWord Second word - * @return bool - */ + /** + * Compare merged vs individual results quality + * SMART LOGIC: Check if merge actually provides value or just matches the next word + * + * @param array $individualChoices Suggestions for first word + * @param array $mergedChoices Suggestions for merged word + * @param Map $individualDistanceMap Distance map for first word + * @param Map $mergedDistanceMap Distance map for merged word + * @param array $nextWordChoices Suggestions for second word alone + * @param string $word First word + * @param string $nextWord Second word + * @return bool + */ private function shouldUseMergedResult( array $individualChoices, array $mergedChoices, - array $individualDistanceMap, - array $mergedDistanceMap, + Map $individualDistanceMap, + Map $mergedDistanceMap, array $nextWordChoices = [], string $word = '', string $nextWord = '' @@ -926,10 +925,10 @@ private function shouldUseMergedResult( // Both have suggestions - compare quality if (!empty($mergedChoices)) { - $avgMergedDistance = array_sum($mergedDistanceMap) / sizeof($mergedDistanceMap); - $avgIndividualDistance = empty($individualDistanceMap) + $avgMergedDistance = array_sum($mergedDistanceMap->values()->toArray()) / sizeof($mergedDistanceMap); + $avgIndividualDistance = $individualDistanceMap->isEmpty() ? 999 - : array_sum($individualDistanceMap) / sizeof($individualDistanceMap); + : array_sum($individualDistanceMap->values()->toArray()) / $individualDistanceMap->count(); // Prefer merged if distance is not significantly worse (allow +1.5 penalty for merging) return $avgMergedDistance <= $avgIndividualDistance + 1.5; @@ -940,32 +939,28 @@ private function shouldUseMergedResult( return false; } - - /** - * Calculates the score map based on normalized distance and document scores. + * Calculate score map from Ds\Map - preserves string keys. * - * @param array $docMapNormalized Normalized document scores - * @param array $distanceMapNormalized Normalized distance scores - * @return array Score map with calculated scores + * @param Map $docMapNormalized + * @param Map $distanceMapNormalized + * @return Map */ - private function calculateScoreMap(array $docMapNormalized, array $distanceMapNormalized): array { - // We are use minimum distance to avoid siutation when less docs affect relevance + private function calculateScoreMapFromMap(Map $docMapNormalized, Map $distanceMapNormalized): Map { $scoreFn = static function (float $distance, float $docs): float { return (float)max($distance + 1, sqrt($docs)) / ($distance + 1); }; - /** @var array $scoreMap */ - $scoreMap = []; + $result = new Map(); foreach ($docMapNormalized as $word => $docScore) { - if (!isset($distanceMapNormalized[$word])) { + if (!$distanceMapNormalized->hasKey($word)) { continue; } - $distanceScore = $distanceMapNormalized[$word]; - $scoreMap[$word] = $scoreFn($docScore, $distanceScore); + $distanceScore = $distanceMapNormalized->get($word); + $result->put($word, $scoreFn($docScore, $distanceScore)); } - return $scoreMap; + return $result; } } diff --git a/src/Tool/Arrays.php b/src/Tool/Arrays.php index abc0bba..ab7841c 100644 --- a/src/Tool/Arrays.php +++ b/src/Tool/Arrays.php @@ -1,12 +1,12 @@ $map + * @return \Ds\Map + */ + public static function normalizeMapValues(\Ds\Map $map): \Ds\Map { + if ($map->isEmpty()) { + return new \Ds\Map(); + } + + $min = $max = null; + foreach ($map as $value) { + $value = (float)$value; + if ($min === null || $value < $min) { + $min = $value; + } + if ($max !== null && $value <= $max) { + continue; + } + + $max = $value; + } + + $result = new \Ds\Map(); + $diff = $max - $min; + foreach ($map as $key => $value) { + $result->put($key, $diff === 0.0 ? 1.0 : ($value - $min) / $diff); + } + + return $result; + } + /** * Run normalization on the array values * In case empty array passed, return empty array @@ -140,13 +174,17 @@ public static function blend(array ...$arrays): array { } /** - * @param array ...$arrays + * @param array|\Ds\Map ...$arrays * @return array */ - public static function getMapSum(array ...$arrays): array { + public static function getMapSum(array|\Ds\Map ...$arrays): array { $result = []; foreach ($arrays as $array) { + // Convert Map to array if needed + if ($array instanceof \Ds\Map) { + $array = $array->toArray(); + } foreach ($array as $key => $value) { if (!isset($result[$key])) { $result[$key] = 0.0;