Skip to content

Commit ff1c331

Browse files
committed
feat: Enhance product identity resolution and category metadata for smartphones
1 parent ec92224 commit ff1c331

8 files changed

Lines changed: 673 additions & 13 deletions

File tree

ProductNormaliser.Application.Tests/IdentityAndMergeTests.cs

Lines changed: 525 additions & 0 deletions
Large diffs are not rendered by default.

ProductNormaliser.Domain.Tests/DefaultCategoryMetadataCatalogTests.cs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,23 @@ public void GetAll_EnablesSupportedAndExperimentalActiveCategories()
6565
Assert.Multiple(() =>
6666
{
6767
Assert.That(enabledCategories.Select(category => category.CategoryKey), Is.EqualTo(new[] { "tv", "monitor", "laptop", "tablet", "smartphone", "headphones", "speakers" }));
68-
Assert.That(enabledCategories.Count(category => category.CrawlSupportStatus == CrawlSupportStatus.Supported), Is.EqualTo(3));
69-
Assert.That(enabledCategories.Count(category => category.CrawlSupportStatus == CrawlSupportStatus.Experimental), Is.EqualTo(4));
68+
Assert.That(enabledCategories.Count(category => category.CrawlSupportStatus == CrawlSupportStatus.Supported), Is.EqualTo(4));
69+
Assert.That(enabledCategories.Count(category => category.CrawlSupportStatus == CrawlSupportStatus.Experimental), Is.EqualTo(3));
70+
});
71+
}
72+
73+
/// <summary>
/// Checks that looking up the "smartphone" key in the default catalog yields an entry
/// that is enabled, marked as supported, and carries the expected display name and
/// schema completeness score.
/// </summary>
[Test]
public void GetByKey_ReturnsSmartphoneAsSupportedEnabledCategory()
{
    var smartphone = DefaultCategoryMetadataCatalog.GetByKey("smartphone");

    // Fail fast on a missing entry before inspecting its properties.
    Assert.That(smartphone, Is.Not.Null);

    Assert.Multiple(() =>
    {
        Assert.That(smartphone!.DisplayName, Is.EqualTo("Smartphones"));
        Assert.That(smartphone.CrawlSupportStatus, Is.EqualTo(CrawlSupportStatus.Supported));
        Assert.That(smartphone.SchemaCompletenessScore, Is.EqualTo(0.91m));
        Assert.That(smartphone.IsEnabled, Is.True);
    });
}
7287
}

ProductNormaliser.Domain/Merging/ProductIdentityResolver.cs

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@ public sealed class ProductIdentityResolver(
1010
ConfidenceScorer? confidenceScorer = null,
1111
ICategoryAttributeNormaliserRegistry? categoryAttributeNormaliserRegistry = null) : IProductIdentityResolver
1212
{
13+
// Category key used for smartphone-specific lookups and conflict messages in this resolver.
private const string SmartphoneCategoryKey = "smartphone";
14+
// Attribute keys, per category, whose values are compared between a source product and a
// candidate; a differing comparable value on any listed key is treated as a strong variant
// conflict. Category keys are looked up case-insensitively (OrdinalIgnoreCase comparer).
private static readonly IReadOnlyDictionary<string, string[]> StrongDisambiguatorKeysByCategory = new Dictionary<string, string[]>(StringComparer.OrdinalIgnoreCase)
15+
{
16+
[SmartphoneCategoryKey] = ["storage_capacity_gb", "manufacturer_part_number", "regional_variant", "carrier_lock_status"],
17+
["tablet"] = ["connectivity", "cellular_generation", "manufacturer_part_number", "regional_variant", "storage_capacity_gb"],
18+
["headphones"] = ["connection_type", "manufacturer_part_number"],
19+
["speakers"] = ["connection_type", "speaker_type", "manufacturer_part_number"]
20+
};
21+
1322
private readonly ProductFingerprintBuilder fingerprintBuilder = fingerprintBuilder ?? new ProductFingerprintBuilder();
1423
private readonly ConfidenceScorer confidenceScorer = confidenceScorer ?? new ConfidenceScorer();
1524
private readonly ICategoryAttributeNormaliserRegistry categoryAttributeNormaliserRegistry = categoryAttributeNormaliserRegistry ?? DefaultCategoryRegistries.CreateAttributeNormaliserRegistry();
@@ -46,7 +55,15 @@ public ProductIdentityMatchResult Match(SourceProduct sourceProduct, IReadOnlyCo
4655
};
4756
}
4857

49-
var brandModelMatch = candidates.FirstOrDefault(candidate =>
58+
var manufacturerPartNumberMatch = FindManufacturerPartNumberMatch(sourceProduct, candidates);
59+
if (manufacturerPartNumberMatch is not null)
60+
{
61+
return manufacturerPartNumberMatch;
62+
}
63+
64+
var broaderCandidates = FilterCandidatesForBroaderMatching(sourceProduct, candidates);
65+
66+
var brandModelMatch = broaderCandidates.FirstOrDefault(candidate =>
5067
string.Equals(candidate.Brand?.Trim(), sourceProduct.Brand?.Trim(), StringComparison.OrdinalIgnoreCase)
5168
&& string.Equals(candidate.ModelNumber?.Trim(), sourceProduct.ModelNumber?.Trim(), StringComparison.OrdinalIgnoreCase)
5269
&& !string.IsNullOrWhiteSpace(candidate.Brand)
@@ -63,13 +80,13 @@ public ProductIdentityMatchResult Match(SourceProduct sourceProduct, IReadOnlyCo
6380
};
6481
}
6582

66-
var categoryIdentityMatch = FindCategoryIdentityMatch(sourceProduct, candidates);
83+
var categoryIdentityMatch = FindCategoryIdentityMatch(sourceProduct, broaderCandidates);
6784
if (categoryIdentityMatch is not null)
6885
{
6986
return categoryIdentityMatch;
7087
}
7188

72-
var bestSimilarityMatch = candidates
89+
var bestSimilarityMatch = broaderCandidates
7390
.Select(candidate => new
7491
{
7592
Candidate = candidate,
@@ -89,6 +106,17 @@ public ProductIdentityMatchResult Match(SourceProduct sourceProduct, IReadOnlyCo
89106
};
90107
}
91108

109+
var strongConflictReason = GetStrongVariantConflictReason(sourceProduct, candidates);
110+
if (strongConflictReason is not null)
111+
{
112+
return new ProductIdentityMatchResult
113+
{
114+
IsMatch = false,
115+
Confidence = 0.00m,
116+
MatchReason = strongConflictReason
117+
};
118+
}
119+
92120
return new ProductIdentityMatchResult
93121
{
94122
IsMatch = false,
@@ -97,6 +125,31 @@ public ProductIdentityMatchResult Match(SourceProduct sourceProduct, IReadOnlyCo
97125
};
98126
}
99127

128+
/// <summary>
/// Searches <paramref name="candidates"/> for the first product in the same category whose
/// "manufacturer_part_number" value equals the source product's (case-insensitive) and whose
/// brand and model number do not conflict with the source.
/// </summary>
/// <param name="sourceProduct">The incoming product being resolved.</param>
/// <param name="candidates">Canonical products to test, in priority order.</param>
/// <returns>
/// A positive match result with confidence 0.98 for the first qualifying candidate, or
/// <c>null</c> when the source category has no strong disambiguator keys, the source has no
/// comparable manufacturer part number, or no candidate qualifies.
/// </returns>
private ProductIdentityMatchResult? FindManufacturerPartNumberMatch(SourceProduct sourceProduct, IReadOnlyCollection<CanonicalProduct> candidates)
{
    if (!TryGetStrongDisambiguatorKeys(sourceProduct.CategoryKey, out _))
    {
        return null;
    }

    if (!TryGetComparableValue(sourceProduct, "manufacturer_part_number", out var sourcePartNumber))
    {
        return null;
    }

    foreach (var candidate in candidates)
    {
        if (!string.Equals(candidate.CategoryKey, sourceProduct.CategoryKey, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!TryGetComparableValue(candidate, "manufacturer_part_number", out var candidatePartNumber))
        {
            continue;
        }

        if (!string.Equals(candidatePartNumber, sourcePartNumber, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // A shared part number is only trusted when brand and model number do not disagree.
        if (!PrimaryIdentitySignalsAreCompatible(sourceProduct, candidate))
        {
            continue;
        }

        return new ProductIdentityMatchResult
        {
            CanonicalProductId = candidate.Id,
            IsMatch = true,
            Confidence = 0.98m,
            MatchReason = "Exact manufacturer part number match."
        };
    }

    return null;
}
152+
100153
private ProductIdentityMatchResult? FindCategoryIdentityMatch(SourceProduct sourceProduct, IReadOnlyCollection<CanonicalProduct> candidates)
101154
{
102155
var identityKeys = categoryAttributeNormaliserRegistry
@@ -138,6 +191,70 @@ public ProductIdentityMatchResult Match(SourceProduct sourceProduct, IReadOnlyCo
138191
};
139192
}
140193

194+
/// <summary>
/// Narrows the candidate set used by the broader matching stages. When the source product's
/// category has strong disambiguator keys, candidates with a strong variant conflict are
/// removed; otherwise the original collection is returned untouched.
/// </summary>
/// <param name="sourceProduct">The incoming product being resolved.</param>
/// <param name="candidates">The full candidate collection.</param>
/// <returns>The same collection instance, or a new array without conflicting candidates.</returns>
private IReadOnlyCollection<CanonicalProduct> FilterCandidatesForBroaderMatching(SourceProduct sourceProduct, IReadOnlyCollection<CanonicalProduct> candidates)
{
    var categoryHasDisambiguators = TryGetStrongDisambiguatorKeys(sourceProduct.CategoryKey, out _);

    return categoryHasDisambiguators
        ? candidates.Where(candidate => !HasStrongVariantConflict(sourceProduct, candidate)).ToArray()
        : candidates;
}
205+
206+
/// <summary>
/// Reports whether the source product and candidate are free of comparable conflicts on
/// both the "brand" and "model_number" keys.
/// </summary>
/// <returns><c>true</c> when neither key conflicts; otherwise <c>false</c>.</returns>
private static bool PrimaryIdentitySignalsAreCompatible(SourceProduct sourceProduct, CanonicalProduct candidate)
{
    if (HasComparableConflict(sourceProduct, candidate, "brand"))
    {
        return false;
    }

    return !HasComparableConflict(sourceProduct, candidate, "model_number");
}
211+
212+
/// <summary>
/// Determines whether the candidate disagrees with the source product on any strong
/// disambiguator key configured for the source product's category.
/// </summary>
/// <returns>
/// <c>false</c> when the category has no configured keys or the candidate is in a different
/// category (case-insensitive comparison); otherwise <c>true</c> if at least one configured
/// key has a comparable conflict.
/// </returns>
private static bool HasStrongVariantConflict(SourceProduct sourceProduct, CanonicalProduct candidate)
{
    if (!TryGetStrongDisambiguatorKeys(sourceProduct.CategoryKey, out var disambiguatorKeys))
    {
        return false;
    }

    if (!string.Equals(candidate.CategoryKey, sourceProduct.CategoryKey, StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }

    foreach (var disambiguatorKey in disambiguatorKeys)
    {
        if (HasComparableConflict(sourceProduct, candidate, disambiguatorKey))
        {
            return true;
        }
    }

    return false;
}
222+
223+
/// <summary>
/// Produces a category-specific explanation when at least one candidate has a strong variant
/// conflict with the source product.
/// </summary>
/// <param name="sourceProduct">The incoming product being resolved.</param>
/// <param name="candidates">Candidates to inspect for strong variant conflicts.</param>
/// <returns>
/// A reason message chosen by the lower-cased source category key, or <c>null</c> when the
/// category has no strong disambiguator keys or no candidate conflicts.
/// </returns>
private static string? GetStrongVariantConflictReason(SourceProduct sourceProduct, IReadOnlyCollection<CanonicalProduct> candidates)
{
    if (!TryGetStrongDisambiguatorKeys(sourceProduct.CategoryKey, out _))
    {
        return null;
    }

    var anyCandidateConflicts = candidates.Any(candidate => HasStrongVariantConflict(sourceProduct, candidate));
    if (!anyCandidateConflicts)
    {
        return null;
    }

    var normalisedCategoryKey = sourceProduct.CategoryKey.ToLowerInvariant();
    switch (normalisedCategoryKey)
    {
        case SmartphoneCategoryKey:
            return "Strong smartphone variant conflict prevented a safe match.";
        case "tablet":
            return "Strong tablet variant conflict prevented a safe match.";
        case "headphones":
            return "Strong headphones variant conflict prevented a safe match.";
        case "speakers":
            return "Strong speakers variant conflict prevented a safe match.";
        default:
            return "Strong category variant conflict prevented a safe match.";
    }
}
240+
241+
/// <summary>
/// Reports a conflict only when both the source product and the candidate expose a comparable
/// value for <paramref name="key"/> and those values differ (case-insensitive comparison).
/// A value missing on either side is never a conflict.
/// </summary>
private static bool HasComparableConflict(SourceProduct sourceProduct, CanonicalProduct candidate, string key)
{
    if (!TryGetComparableValue(sourceProduct, key, out var sourceValue))
    {
        return false;
    }

    if (!TryGetComparableValue(candidate, key, out var candidateValue))
    {
        return false;
    }

    return !string.Equals(sourceValue, candidateValue, StringComparison.OrdinalIgnoreCase);
}
247+
248+
/// <summary>
/// Returns <c>true</c> when <paramref name="categoryKey"/> equals the smartphone category key,
/// ignoring case.
/// </summary>
private static bool IsSmartphoneCategory(string categoryKey)
    => string.Equals(categoryKey, SmartphoneCategoryKey, StringComparison.OrdinalIgnoreCase);
252+
253+
/// <summary>
/// Looks up the strong disambiguator attribute keys configured for a category.
/// </summary>
/// <param name="categoryKey">Category key to look up; a null key is treated as "not configured".</param>
/// <param name="keys">
/// The configured keys when the method returns <c>true</c>; an empty array otherwise, so the
/// non-nullable <c>string[]</c> contract holds even on a miss.
/// </param>
/// <returns><c>true</c> when the category has configured disambiguator keys.</returns>
private static bool TryGetStrongDisambiguatorKeys(string categoryKey, out string[] keys)
{
    // Dictionary lookups throw on a null key; report "no keys" instead of failing the whole match.
    if (categoryKey is not null
        && StrongDisambiguatorKeysByCategory.TryGetValue(categoryKey, out var configuredKeys))
    {
        keys = configuredKeys;
        return true;
    }

    keys = [];
    return false;
}
257+
141258
private static int CountMatchingIdentityAttributes(SourceProduct sourceProduct, CanonicalProduct candidate, IEnumerable<string> identityKeys)
142259
{
143260
return identityKeys.Count(key => TryGetComparableValue(sourceProduct, key, out var sourceValue)

ProductNormaliser.Domain/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ The goal is not just to pick a winner, but to pick a winner with an audit trail.
7171

7272
## Category schema
7373

74-
The first fully modelled category is `tv` in the TV schema provider. Additional category providers for `monitor`, `laptop`, and `refrigerator` now exist so the rest of the platform can score completeness, route normalisation, and expose dashboard metadata without assuming TV-only semantics.
74+
The first fully modelled category is `tv` in the TV schema provider. Additional category providers now exist for `monitor`, `laptop`, `smartphone`, `tablet`, `headphones`, `speakers`, and `refrigerator` so the rest of the platform can score completeness, route normalisation, and expose dashboard metadata without assuming TV-only semantics.
7575

7676
The TV schema still defines the richest required and optional canonical attributes such as:
7777

@@ -91,6 +91,8 @@ Adding a new category typically means:
9191
3. deciding identity heuristics and attribute reliability rules for that category
9292
4. registering metadata so the admin API and web dashboard can discover the category safely
9393

94+
At the current maturity level, `tv`, `monitor`, `laptop`, and `smartphone` are the supported categories with the strongest schema, normalisation, and identity coverage. `tablet`, `headphones`, and `speakers` are enabled experimental categories with broader canonical field sets and category-specific normalisers, but they are still being hardened before promotion.
95+
9496
## How other projects use Domain
9597

9698
- Infrastructure implements many of the Domain interfaces and persists Domain models.

ProductNormaliser.Domain/Schemas/DefaultCategoryMetadataCatalog.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ public static class DefaultCategoryMetadataCatalog
1010
Create("monitor", "Monitors", "display", "Display", "monitor", CrawlSupportStatus.Supported, 0.89m, true),
1111
Create("laptop", "Laptops", "computing", "Computing", "laptop", CrawlSupportStatus.Supported, 0.87m, true),
1212
Create("tablet", "Tablets", "mobile", "Mobile", "tablet", CrawlSupportStatus.Experimental, 0.78m, true),
13-
Create("smartphone", "Smartphones", "mobile", "Mobile", "smartphone", CrawlSupportStatus.Experimental, 0.79m, true),
13+
Create("smartphone", "Smartphones", "mobile", "Mobile", "smartphone", CrawlSupportStatus.Supported, 0.91m, true),
1414
Create("headphones", "Headphones", "audio", "Audio", "headphones", CrawlSupportStatus.Experimental, 0.74m, true),
1515
Create("speakers", "Speakers", "audio", "Audio", "speaker", CrawlSupportStatus.Experimental, 0.72m, true),
1616
Create("refrigerator", "Refrigerators", "kitchen_appliances", "Kitchen Appliances", "refrigerator", CrawlSupportStatus.Planned, 0.10m, false),

ProductNormaliser.Web.Tests/ProductPageRenderingTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ public async Task ExplorerPage_RendersEnabledSupportedAndExperimentalCategoryOpt
9191
new CategoryMetadataDto { CategoryKey = "monitor", DisplayName = "Monitors", IsEnabled = true, CrawlSupportStatus = "Supported" },
9292
new CategoryMetadataDto { CategoryKey = "laptop", DisplayName = "Laptops", IsEnabled = true, CrawlSupportStatus = "Supported" },
9393
new CategoryMetadataDto { CategoryKey = "tablet", DisplayName = "Tablets", IsEnabled = true, CrawlSupportStatus = "Experimental" },
94-
new CategoryMetadataDto { CategoryKey = "smartphone", DisplayName = "Smartphones", IsEnabled = true, CrawlSupportStatus = "Experimental" },
94+
new CategoryMetadataDto { CategoryKey = "smartphone", DisplayName = "Smartphones", IsEnabled = true, CrawlSupportStatus = "Supported" },
9595
new CategoryMetadataDto { CategoryKey = "headphones", DisplayName = "Headphones", IsEnabled = true, CrawlSupportStatus = "Experimental" },
9696
new CategoryMetadataDto { CategoryKey = "speakers", DisplayName = "Speakers", IsEnabled = true, CrawlSupportStatus = "Experimental" },
9797
new CategoryMetadataDto { CategoryKey = "refrigerator", DisplayName = "Refrigerators", IsEnabled = false, CrawlSupportStatus = "Planned" }

ProductNormaliser.Web/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ The web host currently delivers:
99
- an operator landing page that keeps the active category context visible
1010
- an operator landing page operational health panel for queue depth, retry backlog, recent failures, at-risk sources, and category pressure
1111
- an operator landing page boot-and-populate panel showing boot-ready sources, categories in context, estimated discovery seeds, and recent confirmed-product throughput
12-
- category selection for the rollout set: TVs, Monitors, and Laptops
12+
- category selection for the current supported set: TVs, Monitors, Laptops, and Smartphones
13+
- category selectors and explorer filters that continue to expose enabled experimental categories such as Tablets, Headphones, and Speakers with their maturity badges intact
1314
- seeded crawl launch and crawl-job monitoring with discovery and product progress shown together
1415
- canonical product exploration with quality-aware filters and paging
1516
- product detail pages with source comparison, evidence, conflicts, and history

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
ProductNormaliser is an open product-intelligence engine for turning messy retail and manufacturer page data into clean, canonical, comparable product records. It crawls source pages, extracts structured product evidence, normalises attributes into a category schema, resolves identity across sources, merges competing claims into a canonical product, and keeps learning over time from quality history, disagreement patterns, and page volatility.
44

5-
Milestone 1 is centered on an end-to-end operator workflow for three rollout categories: `tv`, `monitor`, and `laptop`. The platform still keeps category and normalisation extension points broad enough for future electrical-goods expansion, but the completed milestone scope is the crawl, management, product, and quality experience for those three categories.
5+
Milestone 1 is centered on an end-to-end operator workflow for four supported categories: `tv`, `monitor`, `laptop`, and `smartphone`. The platform still keeps category and normalisation extension points broad enough for future electrical-goods expansion, while `tablet`, `headphones`, and `speakers` remain enabled experimental categories that share the same workflow surface.
66

77
## What problem this solves
88

@@ -76,7 +76,7 @@ The solution now contains ten projects:
7676

7777
## Architecture at a glance
7878

79-
1. Operators register or enable managed crawl sources and assign categories such as `tv`, `monitor`, and `laptop`.
79+
1. Operators register or enable managed crawl sources and assign categories such as `tv`, `monitor`, `laptop`, and `smartphone`.
8080
2. Each source carries a discovery profile with category entry pages, sitemap hints, allow or deny path rules, URL patterns, depth limits, and per-run budgets.
8181
3. A category crawl job now seeds deterministic discovery from eligible managed sources instead of relying only on pre-known targets.
8282
4. The discovery worker fetches sitemaps and listing pages while respecting robots rules, source throttling, depth limits, and URL budgets.
@@ -93,8 +93,8 @@ The solution now contains ten projects:
9393
The solution currently includes:
9494

9595
- category metadata and schema discovery for electrical-goods families
96-
- category registry support for the Milestone 1 rollout set: TVs, Monitors, and Laptops
97-
- schema-driven attribute normalisation with category-specific providers for TVs, Monitors, and Laptops
96+
- category registry support for the current supported set: TVs, Monitors, Laptops, and Smartphones
97+
- schema-driven attribute normalisation with category-specific providers for TVs, Monitors, Laptops, Smartphones, and enabled experimental next-wave categories
9898
- alias handling and measurement parsing
9999
- structured data extraction from HTML and JSON-LD
100100
- MongoDB persistence for source and canonical records

0 commit comments

Comments
 (0)