From 9b03b5e52cb1ab7690cc041ed93dc290e91f1dff Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 17 Feb 2026 21:11:53 +0100 Subject: [PATCH 01/15] Migrate to Elastic.Ingest.Elasticsearch 0.19.0 with source-generated mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace manual channel orchestration with IncrementalSyncOrchestrator and source-generated ElasticsearchTypeContext from Elastic.Mapping 0.4.0. Add field type attributes ([Keyword], [Text], [Object], etc.) directly on DocumentationDocument to drive the mapping source generator, replacing verbose manual JSON mappings. - Update Elastic.Ingest.Elasticsearch 0.17.1 → 0.19.0, add Elastic.Mapping 0.4.0 - Add mapping attributes to DocumentationDocument and IndexedProduct - Create DocumentationMappingConfig.cs with two Entity variants (lexical/semantic) - Rewrite ElasticsearchMarkdownExporter to use orchestrator for dual-index mode - Delete ElasticsearchIngestChannel.cs and ElasticsearchIngestChannel.Mapping.cs - Remove unused ReindexAsync from ElasticsearchOperations - Update SearchBootstrapFixture to use IngestChannel with semantic type context --- Directory.Packages.props | 3 +- .../Elastic.Documentation.csproj | 1 + .../Search/DocumentationDocument.cs | 21 + .../Search/IndexedProduct.cs | 3 + .../DocumentationMappingConfig.cs | 199 ++++++++ .../ElasticsearchIngestChannel.Mapping.cs | 260 ---------- .../ElasticsearchIngestChannel.cs | 161 ------ .../ElasticsearchMarkdownExporter.Export.cs | 26 +- .../ElasticsearchMarkdownExporter.cs | 478 +++++++----------- .../Elasticsearch/ElasticsearchOperations.cs | 16 - .../Search/SearchBootstrapFixture.cs | 26 +- 11 files changed, 435 insertions(+), 759 deletions(-) create mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.Mapping.cs delete mode 100644 
src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.cs diff --git a/Directory.Packages.props b/Directory.Packages.props index 2adc4ebc0..04966e308 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,7 +48,8 @@ - + + diff --git a/src/Elastic.Documentation/Elastic.Documentation.csproj b/src/Elastic.Documentation/Elastic.Documentation.csproj index 99b59c073..fbc2f8c72 100644 --- a/src/Elastic.Documentation/Elastic.Documentation.csproj +++ b/src/Elastic.Documentation/Elastic.Documentation.csproj @@ -9,6 +9,7 @@ + diff --git a/src/Elastic.Documentation/Search/DocumentationDocument.cs b/src/Elastic.Documentation/Search/DocumentationDocument.cs index e30a4b350..e25ded0ab 100644 --- a/src/Elastic.Documentation/Search/DocumentationDocument.cs +++ b/src/Elastic.Documentation/Search/DocumentationDocument.cs @@ -4,6 +4,7 @@ using System.Text.Json.Serialization; using Elastic.Documentation.AppliesTo; +using Elastic.Mapping; namespace Elastic.Documentation.Search; @@ -12,6 +13,7 @@ public record ParentDocument [JsonPropertyName("title")] public required string Title { get; set; } + [Keyword] [JsonPropertyName("url")] public required string Url { get; set; } } @@ -28,6 +30,7 @@ public record DocumentationDocument [JsonPropertyName("search_title")] public required string SearchTitle { get; set; } + [Keyword(Normalizer = "keyword_normalizer")] [JsonPropertyName("type")] public required string Type { get; set; } = "doc"; @@ -35,6 +38,7 @@ public record DocumentationDocument /// The canonical/primary product for this document (nested object with id and repository). /// Name and version are looked up dynamically by product id. /// + [Object] [JsonPropertyName("product")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public IndexedProduct? Product { get; set; } @@ -42,13 +46,18 @@ public record DocumentationDocument /// /// All related products found during inference (from legacy mappings, applicability, etc.) 
/// + [Object] [JsonPropertyName("related_products")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public IndexedProduct[]? RelatedProducts { get; set; } + [Id] + [Keyword] [JsonPropertyName("url")] public required string Url { get; set; } = string.Empty; + [ContentHash] + [Keyword] [JsonPropertyName("hash")] public string Hash { get; set; } = string.Empty; @@ -58,6 +67,7 @@ public record DocumentationDocument [JsonPropertyName("navigation_table_of_contents")] public int NavigationTableOfContents { get; set; } = 50; //default to a high number so that omission gets penalized. + [Keyword(Normalizer = "keyword_normalizer")] [JsonPropertyName("navigation_section")] public string? NavigationSection { get; set; } @@ -67,18 +77,21 @@ public record DocumentationDocument public DateTimeOffset BatchIndexDate { get; set; } /// The date this document was last updated, + [Timestamp] [JsonPropertyName("last_updated")] public DateTimeOffset LastUpdated { get; set; } [JsonPropertyName("description")] public string? Description { get; set; } + [Text] [JsonPropertyName("headings")] public string[] Headings { get; set; } = []; [JsonPropertyName("links")] public string[] Links { get; set; } = []; + [Nested] [JsonPropertyName("applies_to")] public ApplicableTo? Applies { get; set; } @@ -92,6 +105,7 @@ public record DocumentationDocument [JsonPropertyName("abstract")] public string? Abstract { get; set; } + [Object] [JsonPropertyName("parents")] public ParentDocument[] Parents { get; set; } = []; @@ -105,6 +119,7 @@ public record DocumentationDocument /// Key for enrichment cache lookups. Derived from normalized content + prompt hash. /// Used by enrich processor to join AI-generated fields at index time. /// + [Keyword] [JsonPropertyName("enrichment_key")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? 
EnrichmentKey { get; set; } @@ -112,6 +127,7 @@ public record DocumentationDocument /// /// 3-5 sentences dense with technical entities, API names, and core functionality for vector matching. /// + [Text] [JsonPropertyName("ai_rag_optimized_summary")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? AiRagOptimizedSummary { get; set; } @@ -119,6 +135,7 @@ public record DocumentationDocument /// /// Exactly 5-10 words for a UI tooltip. /// + [Text] [JsonPropertyName("ai_short_summary")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? AiShortSummary { get; set; } @@ -126,6 +143,7 @@ public record DocumentationDocument /// /// A 3-8 word keyword string representing a high-intent user search for this doc. /// + [Keyword] [JsonPropertyName("ai_search_query")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? AiSearchQuery { get; set; } @@ -133,6 +151,7 @@ public record DocumentationDocument /// /// Array of 3-5 specific questions answered by this document. /// + [Text] [JsonPropertyName("ai_questions")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string[]? AiQuestions { get; set; } @@ -140,6 +159,7 @@ public record DocumentationDocument /// /// Array of 2-4 specific use cases this doc helps with. /// + [Text] [JsonPropertyName("ai_use_cases")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string[]? AiUseCases { get; set; } @@ -148,6 +168,7 @@ public record DocumentationDocument /// Hash of the LLM prompt templates used to generate AI fields. /// Used to detect stale enrichments when prompts change. /// + [Keyword] [JsonPropertyName("enrichment_prompt_hash")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? 
EnrichmentPromptHash { get; set; } diff --git a/src/Elastic.Documentation/Search/IndexedProduct.cs b/src/Elastic.Documentation/Search/IndexedProduct.cs index ee766fac1..cdb8925e8 100644 --- a/src/Elastic.Documentation/Search/IndexedProduct.cs +++ b/src/Elastic.Documentation/Search/IndexedProduct.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information using System.Text.Json.Serialization; +using Elastic.Mapping; namespace Elastic.Documentation.Search; @@ -15,12 +16,14 @@ public record IndexedProduct /// /// The product ID from products.yml (e.g., "elasticsearch", "kibana", "apm-agent-java") /// + [Keyword(Normalizer = "keyword_normalizer")] [JsonPropertyName("id")] public string? Id { get; init; } /// /// The repository name (e.g., "elasticsearch", "docs-content", "elastic-otel-java") /// + [Keyword(Normalizer = "keyword_normalizer")] [JsonPropertyName("repository")] public string? Repository { get; init; } } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs new file mode 100644 index 000000000..9ae1c7072 --- /dev/null +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs @@ -0,0 +1,199 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. 
+// See the LICENSE file in the project root for more information + +using Elastic.Documentation.Search; +using Elastic.Ingest.Elasticsearch.Indices; +using Elastic.Mapping; +using Elastic.Mapping.Analysis; + +namespace Elastic.Markdown.Exporters.Elasticsearch; + +[ElasticsearchMappingContext] +[Entity( + Target = EntityTarget.Index, + Name = "docs-lexical", + WriteAlias = "docs-lexical", + ReadAlias = "docs-lexical", + SearchPattern = "docs-lexical-*", + DatePattern = "yyyy.MM.dd.HHmmss", + Configuration = typeof(LexicalConfig) +)] +[Entity( + Target = EntityTarget.Index, + Name = "docs-semantic", + Variant = "Semantic", + WriteAlias = "docs-semantic", + ReadAlias = "docs-semantic", + SearchPattern = "docs-semantic-*", + DatePattern = "yyyy.MM.dd.HHmmss", + Configuration = typeof(SemanticConfig) +)] +public static partial class DocumentationMappingContext; + +public static class LexicalConfig +{ + public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; + + public static DocumentationDocumentMappingsBuilder ConfigureMappings(DocumentationDocumentMappingsBuilder m) => + ConfigureCommonMappings(m); + + internal static DocumentationDocumentMappingsBuilder ConfigureCommonMappings(DocumentationDocumentMappingsBuilder m) => m + // Text fields with custom analyzers and multi-fields + .SearchTitle(f => f + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer") + .MultiField("completion", mf => mf.SearchAsYouType() + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer"))) + .Title(f => f + .SearchAnalyzer("synonyms_analyzer") + .MultiField("keyword", mf => mf.Keyword().Normalizer("keyword_normalizer")) + .MultiField("starts_with", mf => mf.Text() + .Analyzer("starts_with_analyzer") + .SearchAnalyzer("starts_with_analyzer_search")) + .MultiField("completion", mf => mf.SearchAsYouType().SearchAnalyzer("synonyms_analyzer"))) + .StrippedBody(f => f + .Analyzer("synonyms_fixed_analyzer") + 
.SearchAnalyzer("synonyms_analyzer")) + .Abstract(f => f + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer")) + .Headings(f => f + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer")) + // JsonIgnore fields — [Text]/[Keyword] attributes handle the type, + // AddField only needed when custom analyzers are required + .AddField("ai_rag_optimized_summary", f => f.Text() + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer")) + // Keyword fields with multi-fields + .Url(f => f + .MultiField("match", mf => mf.Text()) + .MultiField("prefix", mf => mf.Text().Analyzer("hierarchy_analyzer"))) + // Rank features — no attribute available, must use AddField + .AddField("navigation_depth", f => f.RankFeature().PositiveScoreImpact(false)) + .AddField("navigation_table_of_contents", f => f.RankFeature().PositiveScoreImpact(false)) + // Nested applies_to — sub-fields don't match C# structure (custom JsonConverter) + .AddField("applies_to.type", f => f.Keyword().Normalizer("keyword_normalizer")) + .AddField("applies_to.sub-type", f => f.Keyword().Normalizer("keyword_normalizer")) + .AddField("applies_to.lifecycle", f => f.Keyword().Normalizer("keyword_normalizer")) + .AddField("applies_to.version", f => f.Version()) + // Parent document multi-fields + .AddField("parents.url", f => f.Keyword() + .MultiField("match", mf => mf.Text()) + .MultiField("prefix", mf => mf.Text().Analyzer("hierarchy_analyzer"))) + .AddField("parents.title", f => f.Text() + .SearchAnalyzer("synonyms_analyzer") + .MultiField("keyword", mf => mf.Keyword())); +} + +public static class SemanticConfig +{ + private const string InferenceId = ".elser-2-elastic"; + + public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; + + public static DocumentationDocumentMappingsBuilder ConfigureMappings(DocumentationDocumentMappingsBuilder m) => + LexicalConfig.ConfigureCommonMappings(m) + .AddField("title.semantic_text", f 
=> f.SemanticText().InferenceId(InferenceId)) + .AddField("abstract.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) + .AddField("ai_rag_optimized_summary.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) + .AddField("ai_questions.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) + .AddField("ai_use_cases.semantic_text", f => f.SemanticText().InferenceId(InferenceId)); +} + +/// +/// Builds analysis settings at runtime (includes synonyms that are loaded from configuration). +/// +public static class DocumentationAnalysisFactory +{ + public static AnalysisBuilder BuildAnalysis(AnalysisBuilder analysis, string synonymSetName, string[] indexTimeSynonyms) => analysis + .Normalizer("keyword_normalizer", n => n.Custom() + .CharFilter("strip_non_word_chars") + .Filters("lowercase", "asciifolding", "trim")) + .Analyzer("starts_with_analyzer", a => a.Custom() + .Tokenizer("starts_with_tokenizer") + .Filter("lowercase")) + .Analyzer("starts_with_analyzer_search", a => a.Custom() + .Tokenizer("keyword") + .Filter("lowercase")) + .Analyzer("synonyms_fixed_analyzer", a => a.Custom() + .Tokenizer("group_tokenizer") + .Filters("lowercase", "synonyms_fixed_filter", "kstem")) + .Analyzer("synonyms_analyzer", a => a.Custom() + .Tokenizer("group_tokenizer") + .Filters("lowercase", "synonyms_filter", "kstem")) + .Analyzer("highlight_analyzer", a => a.Custom() + .Tokenizer("group_tokenizer") + .Filters("lowercase", "english_stop")) + .Analyzer("hierarchy_analyzer", a => a.Custom() + .Tokenizer("path_tokenizer")) + .CharFilter("strip_non_word_chars", cf => cf.PatternReplace() + .Pattern(@"\W") + .Replacement(" ")) + .TokenFilter("synonyms_fixed_filter", tf => tf.SynonymGraph() + .Synonyms(indexTimeSynonyms)) + .TokenFilter("synonyms_filter", tf => tf.SynonymGraph() + .SynonymsSet(synonymSetName) + .Updateable(true)) + .TokenFilter("english_stop", tf => tf.Stop() + .Stopwords("_english_")) + .Tokenizer("starts_with_tokenizer", t => t.EdgeNGram() 
+ .MinGram(1) + .MaxGram(10) + .TokenChars("letter", "digit", "symbol", "whitespace")) + .Tokenizer("group_tokenizer", t => t.CharGroup() + .TokenizeOnChars("whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}")) + .Tokenizer("path_tokenizer", t => t.PathHierarchy() + .Delimiter('/')); + + /// + /// Creates the index settings JSON with analysis configuration and optional default pipeline. + /// + public static string BuildSettingsJson(string synonymSetName, string[] indexTimeSynonyms, string? defaultPipeline = null) + { + var analysis = BuildAnalysis(new AnalysisBuilder(), synonymSetName, indexTimeSynonyms); + var analysisJson = analysis.Build().ToJsonString(); + + if (defaultPipeline is not null) + { + // Merge default_pipeline into the settings JSON + return $$""" + { + "default_pipeline": "{{defaultPipeline}}", + "analysis": {{analysisJson}} + } + """; + } + + return $$""" + { + "analysis": {{analysisJson}} + } + """; + } + + /// + /// Creates an ElasticsearchTypeContext with runtime analysis settings and dynamic index name. + /// + public static ElasticsearchTypeContext CreateContext( + ElasticsearchTypeContext baseContext, + string indexName, + string synonymSetName, + string[] indexTimeSynonyms, + string? 
defaultPipeline = null) + { + var settingsJson = BuildSettingsJson(synonymSetName, indexTimeSynonyms, defaultPipeline); + var settingsHash = HashedBulkUpdate.CreateHash(settingsJson); + var hash = HashedBulkUpdate.CreateHash(settingsHash, baseContext.MappingsHash); + + return baseContext.WithIndexName(indexName) with + { + GetSettingsJson = () => settingsJson, + SettingsHash = settingsHash, + Hash = hash, + ConfigureAnalysis = a => BuildAnalysis(a, synonymSetName, indexTimeSynonyms) + }; + } +} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.Mapping.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.Mapping.cs deleted file mode 100644 index 4e36f7a56..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.Mapping.cs +++ /dev/null @@ -1,260 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using Elastic.Documentation.Search; -using Elastic.Ingest.Elasticsearch.Catalog; - -namespace Elastic.Markdown.Exporters.Elasticsearch; - -public abstract partial class ElasticsearchIngestChannel - where TChannelOptions : CatalogIndexChannelOptionsBase - where TChannel : CatalogIndexChannel -{ - protected static string CreateMappingSetting(string synonymSetName, string[] synonyms, string? defaultPipeline = null) - { - var indexTimeSynonyms = $"[{string.Join(",", synonyms.Select(r => $"\"{r}\""))}]"; - var pipelineSetting = defaultPipeline is not null ? 
$"\"default_pipeline\": \"{defaultPipeline}\"," : ""; - // language=json - return - $$$""" - { - {{{pipelineSetting}}} - "analysis": { - "normalizer": { - "keyword_normalizer": { - "type": "custom", - "char_filter": ["strip_non_word_chars"], - "filter": ["lowercase", "asciifolding", "trim"] - } - }, - "analyzer": { - "starts_with_analyzer": { - "tokenizer": "starts_with_tokenizer", - "filter": [ "lowercase" ] - }, - "starts_with_analyzer_search": { - "tokenizer": "keyword", - "filter": [ "lowercase" ] - }, - "synonyms_fixed_analyzer": { - "tokenizer": "group_tokenizer", - "filter": [ - "lowercase", - "synonyms_fixed_filter", - "kstem" - ] - }, - "synonyms_analyzer": { - "tokenizer": "group_tokenizer", - "filter": [ - "lowercase", - "synonyms_filter", - "kstem" - ] - }, - "highlight_analyzer": { - "tokenizer": "group_tokenizer", - "filter": [ - "lowercase", - "english_stop" - ] - }, - "hierarchy_analyzer": { "tokenizer": "path_tokenizer" } - }, - "char_filter": { - "strip_non_word_chars": { - "type": "pattern_replace", - "pattern": "\\W", - "replacement": " " - } - }, - "filter": { - "synonyms_fixed_filter": { - "type": "synonym_graph", - "synonyms": {{{indexTimeSynonyms}}} - }, - "synonyms_filter": { - "type": "synonym_graph", - "synonyms_set": "{{{synonymSetName}}}", - "updateable": true - }, - "english_stop": { - "type": "stop", - "stopwords": "_english_" - } - }, - "tokenizer": { - "starts_with_tokenizer": { - "type": "edge_ngram", - "min_gram": 1, - "max_gram": 10, - "token_chars": [ - "letter", - "digit", - "symbol", - "whitespace" - ] - }, - "group_tokenizer": { - "type": "char_group", - "tokenize_on_chars": [ "whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}" ] - }, - "path_tokenizer": { - "type": "path_hierarchy", - "delimiter": "/" - } - } - } - } - """; - } - - // language=json - protected static string CreateMapping(string? 
inferenceId) => - $$""" - { - "properties": { - "type": { "type" : "keyword", "normalizer": "keyword_normalizer" }, - "product": { - "type": "object", - "properties": { - "id": { "type": "keyword", "normalizer": "keyword_normalizer" }, - "repository": { "type": "keyword", "normalizer": "keyword_normalizer" } - } - }, - "related_products": { - "type": "object", - "properties": { - "id": { "type": "keyword", "normalizer": "keyword_normalizer" }, - "repository": { "type": "keyword", "normalizer": "keyword_normalizer" } - } - }, - "url": { - "type": "keyword", - "fields": { - "match": { "type": "text" }, - "prefix": { "type": "text", "analyzer" : "hierarchy_analyzer" } - } - }, - "navigation_depth" : { "type" : "rank_feature", "positive_score_impact": false }, - "navigation_table_of_contents" : { "type" : "rank_feature", "positive_score_impact": false }, - "navigation_section" : { "type" : "keyword", "normalizer": "keyword_normalizer" }, - "hidden" : { - "type" : "boolean" - }, - "applies_to" : { - "type" : "nested", - "properties" : { - "type" : { "type" : "keyword", "normalizer": "keyword_normalizer" }, - "sub-type" : { "type" : "keyword", "normalizer": "keyword_normalizer" }, - "lifecycle" : { "type" : "keyword", "normalizer": "keyword_normalizer" }, - "version" : { "type" : "version" } - } - }, - "parents" : { - "type" : "object", - "properties" : { - "url" : { - "type": "keyword", - "fields": { - "match": { "type": "text" }, - "prefix": { "type": "text", "analyzer" : "hierarchy_analyzer" } - } - }, - "title": { - "type": "text", - "search_analyzer": "synonyms_analyzer", - "fields": { - "keyword": { "type": "keyword" } - } - } - } - }, - "hash" : { "type" : "keyword" }, - "enrichment_key" : { "type" : "keyword" }, - "search_title": { - "type": "text", - "analyzer": "synonyms_fixed_analyzer", - "search_analyzer": "synonyms_analyzer", - "fields": { - "completion": { - "type": "search_as_you_type", - "analyzer": "synonyms_fixed_analyzer", - "search_analyzer": 
"synonyms_analyzer", - "term_vector": "with_positions_offsets", - "index_options": "offsets" - } - } - }, - "title": { - "type": "text", - "search_analyzer": "synonyms_analyzer", - "fields": { - "keyword": { "type": "keyword", "normalizer": "keyword_normalizer" }, - "starts_with": { "type": "text", "analyzer": "starts_with_analyzer", "search_analyzer": "starts_with_analyzer_search" }, - "completion": { "type": "search_as_you_type", "search_analyzer": "synonyms_analyzer" } - {{(!string.IsNullOrWhiteSpace(inferenceId) ? $$""", "semantic_text": {{{InferenceMapping(inferenceId)}}}""" : "")}} - } - }, - "body": { - "type": "text" - }, - "stripped_body": { - "type": "text", - "analyzer": "synonyms_fixed_analyzer", - "search_analyzer": "synonyms_analyzer", - "term_vector": "with_positions_offsets" - }, - "headings": { - "type": "text", - "analyzer": "synonyms_fixed_analyzer", - "search_analyzer": "synonyms_analyzer" - }, - "abstract": { - "type" : "text", - "analyzer": "synonyms_fixed_analyzer", - "search_analyzer": "synonyms_analyzer", - "fields" : { - {{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}} - } - }, - "ai_rag_optimized_summary": { - "type": "text", - "analyzer": "synonyms_fixed_analyzer", - "search_analyzer": "synonyms_analyzer", - "fields": { - {{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}} - } - }, - "ai_short_summary": { - "type": "text" - }, - "ai_search_query": { - "type": "keyword" - }, - "ai_questions": { - "type": "text", - "fields": { - {{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}} - } - }, - "ai_use_cases": { - "type": "text", - "fields": { - {{(!string.IsNullOrWhiteSpace(inferenceId) ? 
$"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}} - } - }, - "enrichment_prompt_hash": { - "type": "keyword" - } - } - } - """; - - private static string InferenceMapping(string inferenceId) => - $""" - "type": "semantic_text", - "inference_id": "{inferenceId}" - """; -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.cs deleted file mode 100644 index 6ff857956..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchIngestChannel.cs +++ /dev/null @@ -1,161 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using Elastic.Channels; -using Elastic.Documentation.Configuration; -using Elastic.Documentation.Diagnostics; -using Elastic.Documentation.Search; -using Elastic.Documentation.Serialization; -using Elastic.Ingest.Elasticsearch.Catalog; -using Elastic.Ingest.Elasticsearch.Indices; -using Elastic.Ingest.Elasticsearch.Semantic; -using Elastic.Transport; -using Microsoft.Extensions.Logging; - -namespace Elastic.Markdown.Exporters.Elasticsearch; - -public class ElasticsearchLexicalIngestChannel( - ILoggerFactory logFactory, - IDiagnosticsCollector collector, - ElasticsearchEndpoint endpoint, - string indexNamespace, - DistributedTransport transport, - string[] indexTimeSynonyms, - string? 
defaultPipeline = null -) - : ElasticsearchIngestChannel, CatalogIndexChannel> - (logFactory, collector, endpoint, transport, o => new(o), t => new(t) - { - BulkOperationIdLookup = d => d.Url, - // hash, last_updated and batch_index_date are all set before the docs are written to the channel - ScriptedHashBulkUpsertLookup = (d, _) => new HashedBulkUpdate("hash", d.Hash, "ctx._source.batch_index_date = params.batch_index_date", - new Dictionary - { - { "batch_index_date", d.BatchIndexDate.ToString("o") } - }), - GetMapping = () => CreateMapping(null), - GetMappingSettings = () => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms, defaultPipeline), - IndexFormat = - $"{endpoint.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}", - ActiveSearchAlias = $"{endpoint.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}" - }); - -public class ElasticsearchSemanticIngestChannel( - ILoggerFactory logFactory, - IDiagnosticsCollector collector, - ElasticsearchEndpoint endpoint, - string indexNamespace, - DistributedTransport transport, - string[] indexTimeSynonyms, - string? defaultPipeline = null -) - : ElasticsearchIngestChannel, SemanticIndexChannel> - (logFactory, collector, endpoint, transport, o => new(o), t => new(t) - { - BulkOperationIdLookup = d => d.Url, - GetMapping = (inferenceId, _) => CreateMapping(inferenceId), - GetMappingSettings = (_, _) => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms, defaultPipeline), - IndexFormat = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}", - ActiveSearchAlias = $"{endpoint.IndexNamePrefix}-{indexNamespace.ToLowerInvariant()}", - IndexNumThreads = endpoint.IndexNumThreads, - SearchNumThreads = endpoint.SearchNumThreads, - InferenceCreateTimeout = TimeSpan.FromMinutes(endpoint.BootstrapTimeout ?? 
4), - UsePreexistingInferenceIds = !endpoint.NoElasticInferenceService, - InferenceId = endpoint.NoElasticInferenceService ? null : ".elser-2-elastic", - SearchInferenceId = endpoint.NoElasticInferenceService ? null : ".elser-2-elastic" - }); - -public abstract partial class ElasticsearchIngestChannel : IDisposable - where TChannelOptions : CatalogIndexChannelOptionsBase - where TChannel : CatalogIndexChannel -{ - private readonly IDiagnosticsCollector _collector; - public TChannel Channel { get; } - private readonly ILogger _logger; - - protected ElasticsearchIngestChannel( - ILoggerFactory logFactory, - IDiagnosticsCollector collector, - ElasticsearchEndpoint endpoint, - DistributedTransport transport, - Func createChannel, - Func createOptions - ) - { - _collector = collector; - _logger = logFactory.CreateLogger>(); - //The max num threads per allocated node, from testing its best to limit our max concurrency - //producing to this number as well - var options = createOptions(transport); - var i = 0; - options.BufferOptions = new BufferOptions - { - OutboundBufferMaxSize = endpoint.BufferSize, - ExportMaxConcurrency = endpoint.IndexNumThreads, - ExportMaxRetries = endpoint.MaxRetries - }; - options.SerializerContext = SourceGenerationContext.Default; - options.ExportBufferCallback = () => - { - var count = Interlocked.Increment(ref i); - _logger.LogInformation("Exported {Count} documents to Elasticsearch index {IndexName}", - count * endpoint.BufferSize, Channel?.IndexName ?? 
string.Format(options.IndexFormat, "latest")); - }; - options.ExportExceptionCallback = e => - { - _logger.LogError(e, "Failed to export document"); - _collector.EmitGlobalError("Elasticsearch export: failed to export document", e); - }; - options.ServerRejectionCallback = items => - { - foreach (var (doc, responseItem) in items) - { - _collector.EmitGlobalError( - $"Server rejection: {responseItem.Status} {responseItem.Error?.Type} {responseItem.Error?.Reason} for document {doc.Url}"); - } - }; - Channel = createChannel(options); - _logger.LogInformation("Created {Channel} Elasticsearch target for indexing", typeof(TChannel).Name); - } - - public async ValueTask StopAsync(Cancel ctx = default) - { - _logger.LogInformation("Waiting to drain all inflight exports to Elasticsearch"); - var drained = await Channel.WaitForDrainAsync(null, ctx); - if (!drained) - _collector.EmitGlobalError("Elasticsearch export: failed to complete indexing in a timely fashion while shutting down"); - - _logger.LogInformation("Refreshing target index {Index}", Channel.IndexName); - var refreshed = await Channel.RefreshAsync(ctx); - if (!refreshed) - _collector.EmitGlobalError($"Refreshing target index {Channel.IndexName} did not complete successfully"); - - _logger.LogInformation("Applying aliases to {Index}", Channel.IndexName); - var swapped = await Channel.ApplyAliasesAsync(ctx); - if (!swapped) - _collector.EmitGlobalError($"${nameof(ElasticsearchMarkdownExporter)} failed to apply aliases to index {Channel.IndexName}"); - - return drained && refreshed && swapped; - } - - public async ValueTask RefreshAsync(Cancel ctx = default) => await Channel.RefreshAsync(ctx); - - public async ValueTask TryWrite(DocumentationDocument document, Cancel ctx = default) - { - if (Channel.TryWrite(document)) - return true; - - if (await Channel.WaitToWriteAsync(ctx)) - return Channel.TryWrite(document); - return false; - } - - public void Dispose() - { - Channel.Complete(); - Channel.Dispose(); - - 
GC.SuppressFinalize(this); - } -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs index 00f4d65a0..a4c2172d8 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs @@ -27,8 +27,8 @@ public partial class ElasticsearchMarkdownExporter /// private void AssignDocumentMetadata(DocumentationDocument doc) { - var semanticHash = _semanticChannel.Channel.ChannelHash; - var lexicalHash = _lexicalChannel.Channel.ChannelHash; + var semanticHash = _semanticTypeContext?.Hash ?? string.Empty; + var lexicalHash = _lexicalTypeContext.Hash; var hash = HashedBulkUpdate.CreateHash(semanticHash, lexicalHash, doc.Url, doc.Type, doc.StrippedBody ?? string.Empty, string.Join(",", doc.Headings.OrderBy(h => h)), doc.SearchTitle ?? string.Empty, @@ -165,9 +165,7 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, AssignDocumentMetadata(doc); - if (_indexStrategy == IngestStrategy.Multiplex) - return await _lexicalChannel.TryWrite(doc, ctx) && await _semanticChannel.TryWrite(doc, ctx); - return await _lexicalChannel.TryWrite(doc, ctx); + return await WriteDocumentAsync(doc, ctx); } /// @@ -209,22 +207,10 @@ public async ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Canc AssignDocumentMetadata(doc); - // Write to channels following the multiplex or reindex strategy - if (_indexStrategy == IngestStrategy.Multiplex) + if (!await WriteDocumentAsync(doc, ctx)) { - if (!await _lexicalChannel.TryWrite(doc, ctx) || !await _semanticChannel.TryWrite(doc, ctx)) - { - _logger.LogError("Failed to write OpenAPI document {Url}", doc.Url); - return false; - } - } - else - { - if (!await _lexicalChannel.TryWrite(doc, ctx)) - { - _logger.LogError("Failed to write OpenAPI document {Url}", doc.Url); - return 
false; - } + _logger.LogError("Failed to write OpenAPI document {Url}", doc.Url); + return false; } } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 5220bfe39..84c0e5c75 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -4,38 +4,42 @@ using System.Text.Json; using System.Text.Json.Serialization; +using Elastic.Channels; using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Search; using Elastic.Documentation.Configuration.Versions; using Elastic.Documentation.Diagnostics; +using Elastic.Documentation.Search; +using Elastic.Documentation.Serialization; using Elastic.Ingest.Elasticsearch; using Elastic.Ingest.Elasticsearch.Indices; +using Elastic.Mapping; using Elastic.Markdown.Exporters.Elasticsearch.Enrichment; using Elastic.Transport; using Microsoft.Extensions.Logging; -using NetEscapades.EnumGenerators; namespace Elastic.Markdown.Exporters.Elasticsearch; -[EnumExtensions] -public enum IngestStrategy { Reindex, Multiplex } - public partial class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposable { private readonly IDiagnosticsCollector _collector; private readonly IDocumentationConfigurationContext _context; private readonly ILogger _logger; - private readonly ElasticsearchLexicalIngestChannel _lexicalChannel; - private readonly ElasticsearchSemanticIngestChannel _semanticChannel; - private readonly ElasticsearchEndpoint _endpoint; - - private readonly DateTimeOffset _batchIndexDate = DateTimeOffset.UtcNow; private readonly DistributedTransport _transport; - private IngestStrategy _indexStrategy; private readonly string _indexNamespace; - private string _currentLexicalHash = string.Empty; - private string _currentSemanticHash = string.Empty; + private readonly 
DateTimeOffset _batchIndexDate; + + // Ingest: orchestrator for dual-index mode, plain channel for --no-semantic + private readonly IncrementalSyncOrchestrator? _orchestrator; + private readonly IngestChannel? _lexicalOnlyChannel; + + // Type context hashes for document content hash computation + private readonly ElasticsearchTypeContext _lexicalTypeContext; + private readonly ElasticsearchTypeContext? _semanticTypeContext; + + // Alias names for queries/statistics + private readonly string _lexicalAlias; private readonly IReadOnlyDictionary _synonyms; private readonly IReadOnlyCollection _rules; @@ -65,7 +69,6 @@ IDocumentationConfigurationContext context _context = context; _logger = logFactory.CreateLogger(); _endpoint = endpoints.Elasticsearch; - _indexStrategy = IngestStrategy.Reindex; _indexNamespace = indexNamespace; _versionsConfiguration = context.VersionsConfiguration; _synonyms = context.SearchConfiguration.Synonyms; @@ -73,6 +76,7 @@ IDocumentationConfigurationContext context var es = endpoints.Elasticsearch; _transport = ElasticsearchTransportFactory.Create(es); + _operations = new ElasticsearchOperations(_transport, _logger, collector); string[] fixedSynonyms = ["esql", "data-stream", "data-streams", "machine-learning"]; var indexTimeSynonyms = _synonyms.Aggregate(new List(), (acc, synonym) => @@ -83,15 +87,16 @@ IDocumentationConfigurationContext context }).Where(r => fixedSynonyms.Contains(r.Id)).Select(r => r.Synonyms).ToArray(); _fixedSynonymsHash = HashedBulkUpdate.CreateHash(string.Join(",", indexTimeSynonyms)); - // Use AI enrichment pipeline if enabled - hybrid approach: - // - Cache hits: enrich processor applies fields at index time - // - Cache misses: apply fields inline before indexing var aiPipeline = es.EnableAiEnrichment ? 
EnrichPolicyManager.PipelineName : null; - _lexicalChannel = new ElasticsearchLexicalIngestChannel(logFactory, collector, es, indexNamespace, _transport, indexTimeSynonyms, aiPipeline); - _semanticChannel = new ElasticsearchSemanticIngestChannel(logFactory, collector, es, indexNamespace, _transport, indexTimeSynonyms, aiPipeline); + var synonymSetName = $"docs-{indexNamespace}"; + var ns = indexNamespace.ToLowerInvariant(); + var lexicalPrefix = es.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant(); + _lexicalAlias = $"{lexicalPrefix}-{ns}"; - // Initialize shared ES operations - _operations = new ElasticsearchOperations(_transport, _logger, collector); + _lexicalTypeContext = DocumentationAnalysisFactory.CreateContext( + DocumentationMappingContext.DocumentationDocument.Context, + _lexicalAlias, synonymSetName, indexTimeSynonyms, aiPipeline + ); // Initialize AI enrichment services if enabled if (es.EnableAiEnrichment) @@ -100,76 +105,177 @@ IDocumentationConfigurationContext context _llmClient = new ElasticsearchLlmClient(_transport, logFactory.CreateLogger(), _operations); _enrichPolicyManager = new EnrichPolicyManager(_transport, logFactory.CreateLogger(), _enrichmentCache.IndexName); } + + if (!es.NoSemantic) + { + var semanticAlias = $"{es.IndexNamePrefix.ToLowerInvariant()}-{ns}"; + _semanticTypeContext = DocumentationAnalysisFactory.CreateContext( + DocumentationMappingContext.DocumentationDocumentSemantic.Context, + semanticAlias, synonymSetName, indexTimeSynonyms, aiPipeline + ); + + _orchestrator = new IncrementalSyncOrchestrator(_transport, _lexicalTypeContext, _semanticTypeContext) + { + ConfigurePrimary = ConfigureChannelOptions, + ConfigureSecondary = ConfigureChannelOptions, + OnPostComplete = es.EnableAiEnrichment + ? 
async (ctx, ct) => await PostCompleteAsync(ctx, ct) + : null + }; + _ = _orchestrator.AddPreBootstrapTask(async (_, ct) => + { + await InitializeEnrichmentAsync(ct); + await PublishSynonymsAsync(ct); + await PublishQueryRulesAsync(ct); + }); + + _batchIndexDate = _orchestrator.BatchTimestamp; + } + else + { + _batchIndexDate = DateTimeOffset.UtcNow; + var options = new IngestChannelOptions(_transport, _lexicalTypeContext); + ConfigureChannelOptions(options); + _lexicalOnlyChannel = new IngestChannel(options); + } + } + + private void ConfigureChannelOptions(IngestChannelOptions options) + { + options.BufferOptions = new BufferOptions + { + OutboundBufferMaxSize = _endpoint.BufferSize, + ExportMaxConcurrency = _endpoint.IndexNumThreads, + ExportMaxRetries = _endpoint.MaxRetries + }; + options.SerializerContext = SourceGenerationContext.Default; + options.ExportExceptionCallback = e => + { + _logger.LogError(e, "Failed to export document"); + _collector.EmitGlobalError("Elasticsearch export: failed to export document", e); + }; + options.ServerRejectionCallback = items => + { + foreach (var (doc, responseItem) in items) + { + _collector.EmitGlobalError( + $"Server rejection: {responseItem.Status} {responseItem.Error?.Type} {responseItem.Error?.Reason} for document {doc.Url}"); + } + }; } /// public async ValueTask StartAsync(Cancel ctx = default) { - // Initialize AI enrichment cache (pre-loads existing hashes into memory) - if (_enrichmentCache is not null && _enrichPolicyManager is not null) + if (_orchestrator is not null) { - _logger.LogInformation("Initializing AI enrichment cache..."); - await _enrichmentCache.InitializeAsync(ctx); - _logger.LogInformation("AI enrichment cache ready with {Count} existing entries", _enrichmentCache.Count); - - // The enrich pipeline must exist before indexing (used as default_pipeline). - // The pipeline's enrich processor requires the .enrich-* index to exist, - // which is created by executing the policy. 
We execute even with an empty - // cache index - it just creates an empty enrich index that returns no matches. - _logger.LogInformation("Setting up enrich policy and pipeline..."); - await _enrichPolicyManager.ExecutePolicyAsync(ctx); - await _enrichPolicyManager.EnsurePipelineExistsAsync(ctx); + _ = await _orchestrator.StartAsync(BootstrapMethod.Failure, ctx); + _logger.LogInformation("Orchestrator started with {Strategy} strategy", _orchestrator.Strategy); + return; } - _currentLexicalHash = await _lexicalChannel.Channel.GetIndexTemplateHashAsync(ctx) ?? string.Empty; - _currentSemanticHash = await _semanticChannel.Channel.GetIndexTemplateHashAsync(ctx) ?? string.Empty; - + // NoSemantic path + await InitializeEnrichmentAsync(ctx); await PublishSynonymsAsync(ctx); await PublishQueryRulesAsync(ctx); - _ = await _lexicalChannel.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx); - - // if the previous hash does not match the current hash, we know already we want to multiplex to a new index - if (_currentLexicalHash != _lexicalChannel.Channel.ChannelHash) - _indexStrategy = IngestStrategy.Multiplex; + _ = await _lexicalOnlyChannel!.BootstrapElasticsearchAsync(BootstrapMethod.Failure, ctx); + } - if (!_endpoint.NoSemantic) + /// + public async ValueTask StopAsync(Cancel ctx = default) + { + if (_orchestrator is not null) { - var semanticWriteAlias = string.Format(_semanticChannel.Channel.Options.IndexFormat, "latest"); - var semanticIndexAvailable = await _transport.HeadAsync(semanticWriteAlias, ctx); - if (!semanticIndexAvailable.ApiCallDetails.HasSuccessfulStatusCode && _endpoint is { ForceReindex: false, NoSemantic: false }) + _ = await _orchestrator.CompleteAsync(null, ctx); + return; + } + + // NoSemantic path — drain, delete stale, refresh, alias + var drained = await _lexicalOnlyChannel!.WaitForDrainAsync(null, ctx); + if (!drained) + _collector.EmitGlobalError("Elasticsearch export: failed to drain in a timely fashion"); + + // Delete 
stale documents not part of this batch + var deleteQuery = PostData.String($$""" { - _indexStrategy = IngestStrategy.Multiplex; - _logger.LogInformation("Index strategy set to multiplex because {SemanticIndex} does not exist, pass --force-reindex to always use reindex", semanticWriteAlias); + "query": { + "range": { + "batch_index_date": { + "lt": "{{_batchIndexDate:o}}" + } + } + } } + """); + await _operations.DeleteByQueryAsync(_lexicalAlias, deleteQuery, ctx); - //try re-use index if we are re-indexing. Multiplex should always go to a new index - _semanticChannel.Channel.Options.TryReuseIndex = _indexStrategy == IngestStrategy.Reindex; - _ = await _semanticChannel.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx); - } + _ = await _lexicalOnlyChannel.RefreshAsync(ctx); + _ = await _lexicalOnlyChannel.ApplyAliasesAsync(_lexicalAlias, ctx); + } - var lexicalIndexExists = await IndexExists(_lexicalChannel.Channel.IndexName) ? "existing" : "new"; - var semanticIndexExists = await IndexExists(_semanticChannel.Channel.IndexName) ? 
"existing" : "new"; - if (_currentLexicalHash != _lexicalChannel.Channel.ChannelHash) - { - _indexStrategy = IngestStrategy.Multiplex; - _logger.LogInformation("Multiplexing lexical new index: '{Index}' since current hash on server '{HashCurrent}' does not match new '{HashNew}'", - _lexicalChannel.Channel.IndexName, _currentLexicalHash, _lexicalChannel.Channel.ChannelHash); - } - else - _logger.LogInformation("Targeting {State} lexical: '{Index}'", lexicalIndexExists, _lexicalChannel.Channel.IndexName); + private async Task InitializeEnrichmentAsync(Cancel ctx) + { + if (_enrichmentCache is null || _enrichPolicyManager is null) + return; + + _logger.LogInformation("Initializing AI enrichment cache..."); + await _enrichmentCache.InitializeAsync(ctx); + _logger.LogInformation("AI enrichment cache ready with {Count} existing entries", _enrichmentCache.Count); + + _logger.LogInformation("Setting up enrich policy and pipeline..."); + await _enrichPolicyManager.ExecutePolicyAsync(ctx); + await _enrichPolicyManager.EnsurePipelineExistsAsync(ctx); + } + + private async Task PostCompleteAsync(OrchestratorContext context, Cancel ctx) => + await ExecuteEnrichPolicyIfNeededAsync(context.SecondaryWriteAlias, ctx); + + private async ValueTask ExecuteEnrichPolicyIfNeededAsync(string? 
semanticAlias, Cancel ctx) + { + if (_enrichmentCache is null || _enrichPolicyManager is null) + return; - if (!_endpoint.NoSemantic && _currentSemanticHash != _semanticChannel.Channel.ChannelHash) + _logger.LogInformation( + "AI enrichment complete: {CacheHits} cache hits, {Enrichments} enrichments generated (limit: {Limit})", + _cacheHitCount, _enrichmentCount, _enrichmentOptions.MaxNewEnrichmentsPerRun); + + if (_enrichmentCache.Count > 0) { - _indexStrategy = IngestStrategy.Multiplex; - _logger.LogInformation("Multiplexing new index '{Index}' since current hash on server '{HashCurrent}' does not match new '{HashNew}'", - _semanticChannel.Channel.IndexName, _currentSemanticHash, _semanticChannel.Channel.ChannelHash); + _logger.LogInformation("Executing enrich policy to update internal index with {Count} total entries...", _enrichmentCache.Count); + await _enrichPolicyManager.ExecutePolicyAsync(ctx); + + if (semanticAlias is not null) + await BackfillMissingAiFieldsAsync(semanticAlias, ctx); } - else if (!_endpoint.NoSemantic) - _logger.LogInformation("Targeting {State} semantical: '{Index}'", semanticIndexExists, _semanticChannel.Channel.IndexName); + } + + private async ValueTask BackfillMissingAiFieldsAsync(string semanticAlias, Cancel ctx) + { + if (_endpoint.NoSemantic || _enrichmentCache is null || _llmClient is null) + return; + + var currentPromptHash = ElasticsearchLlmClient.PromptHash; - _logger.LogInformation("Using {IndexStrategy} to sync lexical index to semantic index", _indexStrategy.ToStringFast(true)); + _logger.LogInformation( + "Starting AI backfill for documents missing or stale AI fields (cache has {CacheCount} entries, prompt hash: {PromptHash})", + _enrichmentCache.Count, currentPromptHash[..8]); - async ValueTask IndexExists(string name) => (await _transport.HeadAsync(name, ctx)).ApiCallDetails.HasSuccessfulStatusCode; + var query = $$""" + { + "query": { + "bool": { + "must": { "exists": { "field": "enrichment_key" } }, + "should": [ + { 
"bool": { "must_not": { "exists": { "field": "ai_questions" } } } }, + { "bool": { "must_not": { "term": { "enrichment_prompt_hash": "{{currentPromptHash}}" } } } } + ], + "minimum_should_match": 1 + } + } + } + """; + + await _operations.UpdateByQueryAsync(semanticAlias, PostData.String(query), EnrichPolicyManager.PipelineName, ctx); } private async Task PublishSynonymsAsync(Cancel ctx) @@ -246,236 +352,28 @@ private async Task PutQueryRuleset(QueryRuleset ruleset, string rulesetName, Can _logger.LogInformation("Successfully published query ruleset '{RulesetName}'.", rulesetName); } - private async ValueTask CountAsync(string index, string body, Cancel ctx = default) + internal async ValueTask WriteDocumentAsync(DocumentationDocument doc, Cancel ctx) { - var countResponse = await _operations.WithRetryAsync( - () => _transport.PostAsync($"/{index}/_count", PostData.String(body), ctx), - $"POST {index}/_count", - ctx); - return countResponse.Body.Get("count"); - } - - /// - public async ValueTask StopAsync(Cancel ctx = default) - { - var semanticWriteAlias = string.Format(_semanticChannel.Channel.Options.IndexFormat, "latest"); - var lexicalWriteAlias = string.Format(_lexicalChannel.Channel.Options.IndexFormat, "latest"); - - var stopped = await _lexicalChannel.StopAsync(ctx); - if (!stopped) - throw new Exception($"Failed to stop {_lexicalChannel.GetType().Name}"); - - await QueryIngestStatistics(lexicalWriteAlias, ctx); - - if (_indexStrategy == IngestStrategy.Multiplex) + if (_orchestrator is not null) { - if (!_endpoint.NoSemantic) - _ = await _semanticChannel.StopAsync(ctx); - - // cleanup lexical index of old data - await DoDeleteByQuery(lexicalWriteAlias, ctx); - // need to refresh the lexical index to ensure that the delete by query is available - _ = await _lexicalChannel.RefreshAsync(ctx); - await QueryDocumentCounts(ctx); - // ReSharper disable once ConvertIfStatementToConditionalTernaryExpression - if (_endpoint.NoSemantic) - 
_logger.LogInformation("Finish indexing {IndexStrategy} strategy", _indexStrategy.ToStringFast(true)); - else - _logger.LogInformation("Finish syncing to semantic in {IndexStrategy} strategy", _indexStrategy.ToStringFast(true)); - return; + if (_orchestrator.TryWrite(doc)) + return true; + _ = await _orchestrator.WaitToWriteAsync(doc, ctx); + return true; } - if (_endpoint.NoSemantic) - { - _logger.LogInformation("--no-semantic was specified so exiting early before reindexing to {Index}", lexicalWriteAlias); - return; - } - - var semanticIndex = _semanticChannel.Channel.IndexName; - // check if the alias exists - var semanticIndexHead = await _transport.HeadAsync(semanticWriteAlias, ctx); - if (!semanticIndexHead.ApiCallDetails.HasSuccessfulStatusCode) - { - _logger.LogInformation("No semantic index exists yet, creating index {Index} for semantic search", semanticIndex); - _ = await _semanticChannel.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx); - var semanticIndexPut = await _transport.PutAsync(semanticIndex, PostData.String("{}"), ctx); - if (!semanticIndexPut.ApiCallDetails.HasSuccessfulStatusCode) - throw new Exception($"Failed to create index {semanticIndex}: {semanticIndexPut}"); - } - var destinationIndex = _semanticChannel.Channel.IndexName; - - _logger.LogInformation("_reindex updates: '{SourceIndex}' => '{DestinationIndex}'", lexicalWriteAlias, destinationIndex); - var request = PostData.String(@" - { - ""dest"": { - ""index"": """ + destinationIndex + @""" - }, - ""source"": { - ""index"": """ + lexicalWriteAlias + @""", - ""size"": 100, - ""query"": { - ""range"": { - ""last_updated"": { - ""gte"": """ + _batchIndexDate.ToString("o") + @""" - } - } - } - } - }"); - await DoReindex(request, lexicalWriteAlias, destinationIndex, "updates", ctx); - - _logger.LogInformation("_reindex deletions: '{SourceIndex}' => '{DestinationIndex}'", lexicalWriteAlias, destinationIndex); - request = PostData.String(@" - { - ""dest"": { - 
""index"": """ + destinationIndex + @""" - }, - ""script"": { - ""source"": ""ctx.op = \""delete\"""" - }, - ""source"": { - ""index"": """ + lexicalWriteAlias + @""", - ""size"": 100, - ""query"": { - ""range"": { - ""batch_index_date"": { - ""lt"": """ + _batchIndexDate.ToString("o") + @""" - } - } - } - } - }"); - await DoReindex(request, lexicalWriteAlias, destinationIndex, "deletions", ctx); - - await DoDeleteByQuery(lexicalWriteAlias, ctx); - - _ = await _lexicalChannel.Channel.ApplyLatestAliasAsync(ctx); - _ = await _semanticChannel.Channel.ApplyAliasesAsync(ctx); - - _ = await _lexicalChannel.RefreshAsync(ctx); - _ = await _semanticChannel.RefreshAsync(ctx); - - _logger.LogInformation("Finish sync to semantic index using {IndexStrategy} strategy", _indexStrategy.ToStringFast(true)); - await QueryDocumentCounts(ctx); - - // Execute enrich policy so new cache entries are available for next run - await ExecuteEnrichPolicyIfNeededAsync(ctx); + if (_lexicalOnlyChannel!.TryWrite(doc)) + return true; + if (await _lexicalOnlyChannel.WaitToWriteAsync(ctx)) + return _lexicalOnlyChannel.TryWrite(doc); + return false; } - private async ValueTask ExecuteEnrichPolicyIfNeededAsync(Cancel ctx) - { - if (_enrichmentCache is null || _enrichPolicyManager is null) - return; - - _logger.LogInformation( - "AI enrichment complete: {CacheHits} cache hits, {Enrichments} enrichments generated (limit: {Limit})", - _cacheHitCount, _enrichmentCount, _enrichmentOptions.MaxNewEnrichmentsPerRun); - - if (_enrichmentCache.Count > 0) - { - _logger.LogInformation("Executing enrich policy to update internal index with {Count} total entries...", _enrichmentCache.Count); - await _enrichPolicyManager.ExecutePolicyAsync(ctx); - - // Backfill: Apply AI fields to documents that were skipped by hash-based upsert - await BackfillMissingAiFieldsAsync(ctx); - } - } - - private async ValueTask BackfillMissingAiFieldsAsync(Cancel ctx) - { - // Why backfill is needed: - // The exporter uses hash-based 
upsert - unchanged documents are skipped during indexing. - // These skipped documents never pass through the ingest pipeline, so they miss AI fields. - // This backfill runs _update_by_query with the AI pipeline to enrich those documents. - // - // Additionally, when prompts change, existing documents have stale AI fields. - // We detect this by checking if the document's prompt_hash differs from the current one. - // - // Only backfill the semantic index - it's what the search API uses. - // The lexical index is just an intermediate step for reindexing. - if (_endpoint.NoSemantic || _enrichmentCache is null || _llmClient is null) - return; - - var semanticAlias = _semanticChannel.Channel.Options.ActiveSearchAlias; - var currentPromptHash = ElasticsearchLlmClient.PromptHash; - - _logger.LogInformation( - "Starting AI backfill for documents missing or stale AI fields (cache has {CacheCount} entries, prompt hash: {PromptHash})", - _enrichmentCache.Count, currentPromptHash[..8]); - - // Find documents with enrichment_key that either: - // 1. Missing AI fields (never enriched), OR - // 2. 
Have stale/missing enrichment_prompt_hash (enriched with old prompts) - var query = $$""" - { - "query": { - "bool": { - "must": { "exists": { "field": "enrichment_key" } }, - "should": [ - { "bool": { "must_not": { "exists": { "field": "ai_questions" } } } }, - { "bool": { "must_not": { "term": { "enrichment_prompt_hash": "{{currentPromptHash}}" } } } } - ], - "minimum_should_match": 1 - } - } - } - """; - - await RunBackfillQuery(semanticAlias, query, ctx); - } - - private async ValueTask RunBackfillQuery(string indexAlias, string query, Cancel ctx) => - await _operations.UpdateByQueryAsync(indexAlias, PostData.String(query), EnrichPolicyManager.PipelineName, ctx); - - private async ValueTask QueryIngestStatistics(string lexicalWriteAlias, Cancel ctx) - { - var lexicalSearchAlias = _lexicalChannel.Channel.Options.ActiveSearchAlias; - var updated = await CountAsync(lexicalSearchAlias, $$""" { "query": { "range": { "last_updated": { "gte": "{{_batchIndexDate:o}}" } } } }""", ctx); - var total = await CountAsync(lexicalSearchAlias, $$""" { "query": { "range": { "batch_index_date": { "gte": "{{_batchIndexDate:o}}" } } } }""", ctx); - var deleted = await CountAsync(lexicalSearchAlias, $$""" { "query": { "range": { "batch_index_date": { "lt": "{{_batchIndexDate:o}}" } } } }""", ctx); - - // TODO emit these as metrics - _logger.LogInformation("Exported {Total}, Updated {Updated}, Deleted, {Deleted} documents to {LexicalIndex}", total, updated, deleted, lexicalWriteAlias); - _logger.LogInformation("Syncing to semantic index using {IndexStrategy} strategy", _indexStrategy.ToStringFast(true)); - } - - private async ValueTask QueryDocumentCounts(Cancel ctx) - { - var semanticWriteAlias = string.Format(_semanticChannel.Channel.Options.IndexFormat, "latest"); - var lexicalWriteAlias = string.Format(_lexicalChannel.Channel.Options.IndexFormat, "latest"); - var totalLexical = await CountAsync(lexicalWriteAlias, "{}", ctx); - var totalSemantic = await 
CountAsync(semanticWriteAlias, "{}", ctx); - - // TODO emit these as metrics - _logger.LogInformation("Document counts -> Semantic Index: {TotalSemantic}, Lexical Index: {TotalLexical}", totalSemantic, totalLexical); - } - - private async ValueTask DoDeleteByQuery(string lexicalWriteAlias, Cancel ctx) - { - // delete all documents with batch_index_date < _batchIndexDate - // they weren't part of the current export - _logger.LogInformation("Delete data in '{SourceIndex}' not part of batch date: {Date}", lexicalWriteAlias, _batchIndexDate.ToString("o")); - var query = PostData.String(@" - { - ""query"": { - ""range"": { - ""batch_index_date"": { - ""lt"": """ + _batchIndexDate.ToString("o") + @""" - } - } - } - }"); - await _operations.DeleteByQueryAsync(lexicalWriteAlias, query, ctx); - } - - private async ValueTask DoReindex(PostData request, string lexicalWriteAlias, string semanticWriteAlias, string typeOfSync, Cancel ctx) => - await _operations.ReindexAsync(request, lexicalWriteAlias, semanticWriteAlias, typeOfSync, ctx); - /// public void Dispose() { - _lexicalChannel.Dispose(); - _semanticChannel.Dispose(); + _orchestrator?.Dispose(); + _lexicalOnlyChannel?.Dispose(); _llmClient?.Dispose(); GC.SuppressFinalize(this); } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchOperations.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchOperations.cs index 4f94ae14a..3a3952406 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchOperations.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchOperations.cs @@ -161,22 +161,6 @@ public async Task DeleteByQueryAsync( await PollTaskUntilCompleteAsync(taskId, "_delete_by_query", index, null, ct); } - /// - /// Executes a reindex operation and waits for completion. 
- /// - public async Task ReindexAsync( - PostData request, - string sourceIndex, - string destIndex, - string operationType, - CancellationToken ct) - { - var url = "/_reindex?wait_for_completion=false&scroll=10m"; - var taskId = await PostAsyncTaskAsync(url, request, $"POST _reindex ({operationType})", ct); - if (taskId is not null) - await PollTaskUntilCompleteAsync(taskId, $"_reindex {operationType}", sourceIndex, destIndex, ct); - } - /// /// Executes an update_by_query operation and waits for completion. /// diff --git a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs index 18b096b6a..9e696c984 100644 --- a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs +++ b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs @@ -7,7 +7,10 @@ using Documentation.Builder.Diagnostics.Console; using Elastic.Documentation.Aspire; using Elastic.Documentation.Configuration; +using Elastic.Documentation.Search; using Elastic.Ingest.Elasticsearch; +using Elastic.Ingest.Elasticsearch.Indices; +using Elastic.Mapping; using Elastic.Markdown.Exporters.Elasticsearch; using Elastic.Transport; using Elastic.Transport.Products.Elasticsearch; @@ -175,22 +178,23 @@ private async ValueTask IsIndexingNeeded() var loggerFactory = fixture.DistributedApplication.Services.GetRequiredService(); var collector = new ConsoleDiagnosticsCollector(loggerFactory); - // Create semantic exporter to check channel hash (index namespace is 'dev' for tests) - using var semanticExporter = new ElasticsearchSemanticIngestChannel( - loggerFactory, - collector, - endpoint, - "dev", // index namespace - transport, + // Create semantic type context to check channel hash (index namespace is 'dev' for tests) + var semanticTypeContext = DocumentationAnalysisFactory.CreateContext( + 
DocumentationMappingContext.DocumentationDocumentSemantic.Context, + $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-dev", + "docs-dev", [] ); + var options = new IngestChannelOptions(transport, semanticTypeContext); + using var channel = new IngestChannel(options); + // Get the current hash from Elasticsearch index template - var currentSemanticHash = await semanticExporter.Channel.GetIndexTemplateHashAsync(TestContext.Current.CancellationToken) ?? string.Empty; + var currentSemanticHash = await channel.GetIndexTemplateHashAsync(TestContext.Current.CancellationToken) ?? string.Empty; - // Get the expected channel hash from the semantic exporter - await semanticExporter.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Silent, ctx: TestContext.Current.CancellationToken); - var expectedSemanticHash = semanticExporter.Channel.ChannelHash; + // Get the expected channel hash + _ = await channel.BootstrapElasticsearchAsync(BootstrapMethod.Silent, TestContext.Current.CancellationToken); + var expectedSemanticHash = channel.ChannelHash; Console.WriteLine($"Elasticsearch semantic hash: '{currentSemanticHash}'"); Console.WriteLine($"Expected semantic hash: '{expectedSemanticHash}'"); From c9e0a1e3a58fb21406c0dddd09432e48a8492451 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Wed, 18 Feb 2026 11:18:01 +0100 Subject: [PATCH 02/15] Centralize Elasticsearch configuration into DocumentationEndpoints Replaces `ElasticsearchOptions` with `DocumentationEndpoints` as the single source of truth for Elasticsearch configuration across all API apps, MCP server, and integration tests. - Adds `IndexName` property to `ElasticsearchEndpoint` with a field-backed getter defaulting to `{IndexNamePrefix}-dev-latest`. - Creates `ElasticsearchEndpointFactory` in `ServiceDefaults` to centralize user-secrets and environment variable reading, eliminating the duplicated `72f50f33` secrets ID pattern. - Registers `DocumentationEndpoints` as a singleton in `AddDocumentationServiceDefaults`. 
- Updates `ElasticsearchClientAccessor` to accept `DocumentationEndpoints` instead of `ElasticsearchOptions`, supporting both API key and basic authentication. - Updates all gateway consumers (`NavigationSearchGateway`, `FullSearchGateway`, `DocumentGateway`, `ElasticsearchAskAiMessageFeedbackGateway`) to use endpoint properties. - Simplifies all three integration test files (`SearchRelevanceTests`, `McpToolsIntegrationTestsBase`, `SearchBootstrapFixture`) to use `ElasticsearchEndpointFactory` and `ElasticsearchTransportFactory`, removing manual config construction. - Deletes `ElasticsearchOptions.cs` and removes `Microsoft.Extensions.Configuration.UserSecrets` from the Search project. --- .../DocumentationEndpoints.cs | 6 ++ .../AppDefaultsExtensions.cs | 3 + ...astic.Documentation.ServiceDefaults.csproj | 1 + .../ElasticsearchEndpointFactory.cs | 66 +++++++++++++++++ .../Elastic.Documentation.Api.App/Program.cs | 13 ++-- ...lasticsearchAskAiMessageFeedbackGateway.cs | 15 ++-- .../Gateways/DocumentGateway.cs | 4 +- .../Program.cs | 12 ++-- .../Common/ElasticsearchClientAccessor.cs | 22 ++++-- .../Elastic.Documentation.Search.csproj | 1 - .../ElasticsearchOptions.cs | 39 ---------- .../FullSearchGateway.cs | 4 +- .../NavigationSearchGateway.cs | 6 +- .../ServicesExtension.cs | 2 - .../Search/SearchBootstrapFixture.cs | 72 ++----------------- .../Mcp.Remote.IntegrationTests.csproj | 1 + .../McpToolsIntegrationTestsBase.cs | 62 ++++------------ .../Search.IntegrationTests.csproj | 1 + .../SearchRelevanceTests.cs | 37 ++-------- 19 files changed, 150 insertions(+), 217 deletions(-) create mode 100644 src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs delete mode 100644 src/services/Elastic.Documentation.Search/ElasticsearchOptions.cs diff --git a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs index 367fe844b..2bec0b94b 100644 --- 
a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs +++ b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs @@ -28,6 +28,12 @@ public class ElasticsearchEndpoint // index options public string IndexNamePrefix { get; set; } = "semantic-docs"; + public string IndexName + { + get => field ?? $"{IndexNamePrefix}-dev-latest"; + set; + } + // channel buffer options public int BufferSize { get; set; } = 50; // Reduced for Serverless rate limits public int MaxRetries { get; set; } = 5; // Increased for 429 retries diff --git a/src/Elastic.Documentation.ServiceDefaults/AppDefaultsExtensions.cs b/src/Elastic.Documentation.ServiceDefaults/AppDefaultsExtensions.cs index eae34aeac..4b3a497eb 100644 --- a/src/Elastic.Documentation.ServiceDefaults/AppDefaultsExtensions.cs +++ b/src/Elastic.Documentation.ServiceDefaults/AppDefaultsExtensions.cs @@ -45,6 +45,9 @@ public static TBuilder AddDocumentationServiceDefaults(this TBuilder b _ = builder.Services.AddElasticDocumentationLogging(globalArgs.LogLevel, noConsole: globalArgs.IsMcp); _ = services.AddSingleton(globalArgs); + var endpoints = ElasticsearchEndpointFactory.Create(builder.Configuration); + _ = services.AddSingleton(endpoints); + return builder.AddServiceDefaults(); } diff --git a/src/Elastic.Documentation.ServiceDefaults/Elastic.Documentation.ServiceDefaults.csproj b/src/Elastic.Documentation.ServiceDefaults/Elastic.Documentation.ServiceDefaults.csproj index 4357d65ce..ef3d8edd8 100644 --- a/src/Elastic.Documentation.ServiceDefaults/Elastic.Documentation.ServiceDefaults.csproj +++ b/src/Elastic.Documentation.ServiceDefaults/Elastic.Documentation.ServiceDefaults.csproj @@ -17,6 +17,7 @@ + diff --git a/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs b/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs new file mode 100644 index 000000000..418ff21ef --- /dev/null +++ b/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs @@ 
-0,0 +1,66 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using Elastic.Documentation.Configuration; +using Microsoft.Extensions.Configuration; + +namespace Elastic.Documentation.ServiceDefaults; + +/// Centralizes user-secrets + env-var reading for Elasticsearch configuration. +public static class ElasticsearchEndpointFactory +{ + private const string UserSecretsId = "72f50f33-6fb9-4d08-bff3-39568fe370b3"; + + /// + /// Creates from user secrets and environment variables. + /// Never returns null: falls back to a default http://localhost:9200 endpoint when no URL is configured. + /// + public static DocumentationEndpoints Create(IConfiguration? appConfiguration = null) + { + var configBuilder = new ConfigurationBuilder(); + _ = configBuilder.AddUserSecrets(UserSecretsId); + _ = configBuilder.AddEnvironmentVariables(); + var config = configBuilder.Build(); + + var url = + config["Parameters:DocumentationElasticUrl"] + ?? config["DOCUMENTATION_ELASTIC_URL"]; + + var apiKey = + config["Parameters:DocumentationElasticApiKey"] + ?? config["DOCUMENTATION_ELASTIC_APIKEY"]; + + var password = + config["Parameters:DocumentationElasticPassword"] + ?? config["DOCUMENTATION_ELASTIC_PASSWORD"]; + + var username = + config["Parameters:DocumentationElasticUsername"] + ?? config["DOCUMENTATION_ELASTIC_USERNAME"] + ?? 
"elastic"; + + if (string.IsNullOrEmpty(url)) + { + return new DocumentationEndpoints + { + Elasticsearch = new ElasticsearchEndpoint { Uri = new Uri("http://localhost:9200") } + }; + } + + var indexName = appConfiguration?["DOCUMENTATION_ELASTIC_INDEX"]; + + var endpoint = new ElasticsearchEndpoint + { + Uri = new Uri(url), + ApiKey = apiKey, + Password = password, + Username = username + }; + + if (indexName is not null) + endpoint.IndexName = indexName; + + return new DocumentationEndpoints { Elasticsearch = endpoint }; + } +} diff --git a/src/api/Elastic.Documentation.Api.App/Program.cs b/src/api/Elastic.Documentation.Api.App/Program.cs index 2165dead6..373ae9a46 100644 --- a/src/api/Elastic.Documentation.Api.App/Program.cs +++ b/src/api/Elastic.Documentation.Api.App/Program.cs @@ -4,8 +4,8 @@ using Elastic.Documentation.Api.Infrastructure; using Elastic.Documentation.Api.Infrastructure.OpenTelemetry; +using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Assembler; -using Elastic.Documentation.Search; using Elastic.Documentation.ServiceDefaults; using Microsoft.AspNetCore.Diagnostics; using Microsoft.AspNetCore.Diagnostics.HealthChecks; @@ -82,17 +82,18 @@ static void LogElasticsearchConfiguration(WebApplication app, ILogger logger) { try { - var esOptions = app.Services.GetService(); - if (esOptions != null) + var endpoints = app.Services.GetService(); + if (endpoints is not null) { + var endpoint = endpoints.Elasticsearch; logger.LogInformation( "Elasticsearch configuration - Url: {Url}, Index: {Index}", - esOptions.Url, - esOptions.IndexName + endpoint.Uri, + endpoint.IndexName ); } else - logger.LogWarning("ElasticsearchOptions could not be resolved from DI"); + logger.LogWarning("DocumentationEndpoints could not be resolved from DI"); } catch (Exception ex) { diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/AskAi/ElasticsearchAskAiMessageFeedbackGateway.cs 
b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/AskAi/ElasticsearchAskAiMessageFeedbackGateway.cs index 345d07ba5..99e20e5ec 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/AskAi/ElasticsearchAskAiMessageFeedbackGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/AskAi/ElasticsearchAskAiMessageFeedbackGateway.cs @@ -7,7 +7,7 @@ using Elastic.Clients.Elasticsearch.Serialization; using Elastic.Documentation.Api.Core; using Elastic.Documentation.Api.Core.AskAi; -using Elastic.Documentation.Search; +using Elastic.Documentation.Configuration; using Elastic.Transport; using Microsoft.Extensions.Logging; @@ -25,20 +25,27 @@ public sealed class ElasticsearchAskAiMessageFeedbackGateway : IAskAiMessageFeed private bool _disposed; public ElasticsearchAskAiMessageFeedbackGateway( - ElasticsearchOptions elasticsearchOptions, + DocumentationEndpoints endpoints, AppEnvironment appEnvironment, ILogger logger) { _logger = logger; _indexName = $"ask-ai-message-feedback-{appEnvironment.Current.ToStringFast(true)}"; - _nodePool = new SingleNodePool(new Uri(elasticsearchOptions.Url.Trim())); + var endpoint = endpoints.Elasticsearch; + _nodePool = new SingleNodePool(endpoint.Uri); + var auth = endpoint.ApiKey is { } apiKey + ? (AuthorizationHeader)new ApiKey(apiKey) + : endpoint is { Username: { } username, Password: { } password } + ? 
new BasicAuthentication(username, password) + : null!; + using var clientSettings = new ElasticsearchClientSettings( _nodePool, sourceSerializer: (_, settings) => new DefaultSourceSerializer(settings, MessageFeedbackJsonContext.Default) ) .DefaultIndex(_indexName) - .Authentication(new ApiKey(elasticsearchOptions.ApiKey)); + .Authentication(auth); _client = new ElasticsearchClient(clientSettings); } diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs b/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs index aef0f15cb..168926126 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs @@ -24,7 +24,7 @@ public class DocumentGateway( try { var response = await clientAccessor.Client.SearchAsync(s => s - .Indices(clientAccessor.Options.IndexName) + .Indices(clientAccessor.Endpoint.IndexName) .Query(q => q.Term(t => t.Field(f => f.Url.Suffix("keyword")).Value(url))) .Size(1) .Source(sf => sf.Filter(f => f.Includes( @@ -101,7 +101,7 @@ public class DocumentGateway( try { var response = await clientAccessor.Client.SearchAsync(s => s - .Indices(clientAccessor.Options.IndexName) + .Indices(clientAccessor.Endpoint.IndexName) .Query(q => q.Term(t => t.Field(f => f.Url.Suffix("keyword")).Value(url))) .Size(1) .Source(sf => sf.Filter(f => f.Includes( diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs index 42ddc27de..b9ae29394 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information using Elastic.Documentation.Api.Infrastructure.OpenTelemetry; +using Elastic.Documentation.Configuration; using Elastic.Documentation.Mcp.Remote.Gateways; using Elastic.Documentation.Mcp.Remote.Tools; using Elastic.Documentation.Search; @@ 
-82,17 +83,18 @@ static void LogElasticsearchConfiguration(WebApplication app, ILogger logger) { try { - var esOptions = app.Services.GetService(); - if (esOptions != null) + var endpoints = app.Services.GetService(); + if (endpoints is not null) { + var endpoint = endpoints.Elasticsearch; logger.LogInformation( "Elasticsearch configuration - Url: {Url}, Index: {Index}", - esOptions.Url, - esOptions.IndexName + endpoint.Uri, + endpoint.IndexName ); } else - logger.LogWarning("ElasticsearchOptions could not be resolved from DI"); + logger.LogWarning("DocumentationEndpoints could not be resolved from DI"); } catch (Exception ex) { diff --git a/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs b/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs index b49b02250..747408968 100644 --- a/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs +++ b/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs @@ -4,6 +4,7 @@ using Elastic.Clients.Elasticsearch; using Elastic.Clients.Elasticsearch.Serialization; +using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Search; using Elastic.Transport; @@ -18,31 +19,38 @@ public class ElasticsearchClientAccessor : IDisposable private readonly ElasticsearchClientSettings _clientSettings; private readonly SingleNodePool _nodePool; public ElasticsearchClient Client { get; } - public ElasticsearchOptions Options { get; } + public ElasticsearchEndpoint Endpoint { get; } public SearchConfiguration SearchConfiguration { get; } public string? 
RulesetName { get; } public IReadOnlyDictionary SynonymBiDirectional { get; } public IReadOnlyCollection DiminishTerms { get; } public ElasticsearchClientAccessor( - ElasticsearchOptions elasticsearchOptions, + DocumentationEndpoints endpoints, SearchConfiguration searchConfiguration) { - Options = elasticsearchOptions; + var endpoint = endpoints.Elasticsearch; + Endpoint = endpoint; SearchConfiguration = searchConfiguration; SynonymBiDirectional = searchConfiguration.SynonymBiDirectional; DiminishTerms = searchConfiguration.DiminishTerms; RulesetName = searchConfiguration.Rules.Count > 0 - ? ExtractRulesetName(elasticsearchOptions.IndexName) + ? ExtractRulesetName(endpoint.IndexName) : null; - _nodePool = new SingleNodePool(new Uri(elasticsearchOptions.Url.Trim())); + _nodePool = new SingleNodePool(endpoint.Uri); + var auth = endpoint.ApiKey is { } apiKey + ? (AuthorizationHeader)new ApiKey(apiKey) + : endpoint is { Username: { } username, Password: { } password } + ? new BasicAuthentication(username, password) + : null!; + _clientSettings = new ElasticsearchClientSettings( _nodePool, sourceSerializer: (_, settings) => new DefaultSourceSerializer(settings, EsJsonContext.Default) ) - .DefaultIndex(elasticsearchOptions.IndexName) - .Authentication(new ApiKey(elasticsearchOptions.ApiKey)); + .DefaultIndex(endpoint.IndexName) + .Authentication(auth); Client = new ElasticsearchClient(_clientSettings); } diff --git a/src/services/Elastic.Documentation.Search/Elastic.Documentation.Search.csproj b/src/services/Elastic.Documentation.Search/Elastic.Documentation.Search.csproj index 27eb575bc..8a350648d 100644 --- a/src/services/Elastic.Documentation.Search/Elastic.Documentation.Search.csproj +++ b/src/services/Elastic.Documentation.Search/Elastic.Documentation.Search.csproj @@ -17,7 +17,6 @@ - diff --git a/src/services/Elastic.Documentation.Search/ElasticsearchOptions.cs b/src/services/Elastic.Documentation.Search/ElasticsearchOptions.cs deleted file mode 100644 index 
9327ae816..000000000 --- a/src/services/Elastic.Documentation.Search/ElasticsearchOptions.cs +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using Microsoft.Extensions.Configuration; - -namespace Elastic.Documentation.Search; - -public class ElasticsearchOptions -{ - public ElasticsearchOptions(IConfiguration configuration) - { - // Build a new ConfigurationBuilder to read user secrets - var configBuilder = new ConfigurationBuilder(); - _ = configBuilder.AddUserSecrets("72f50f33-6fb9-4d08-bff3-39568fe370b3"); - var userSecretsConfig = configBuilder.Build(); - var elasticUrlFromSecret = userSecretsConfig["Parameters:DocumentationElasticUrl"]; - var elasticApiKeyFromSecret = userSecretsConfig["Parameters:DocumentationElasticApiKey"]; - - Url = GetEnv("DOCUMENTATION_ELASTIC_URL", elasticUrlFromSecret); - ApiKey = GetEnv("DOCUMENTATION_ELASTIC_APIKEY", elasticApiKeyFromSecret); - IndexName = configuration["DOCUMENTATION_ELASTIC_INDEX"] ?? "semantic-docs-dev-latest"; - } - - private static string GetEnv(string name, string? 
defaultValue = null) - { - var value = Environment.GetEnvironmentVariable(name); - if (!string.IsNullOrEmpty(value)) - return value; - if (defaultValue != null) - return defaultValue; - throw new ArgumentException($"Environment variable '{name}' not found."); - } - - // Read from environment variables (set by Terraform from SSM at deploy time) - public string Url { get; } - public string ApiKey { get; } - public string IndexName { get; } -} diff --git a/src/services/Elastic.Documentation.Search/FullSearchGateway.cs b/src/services/Elastic.Documentation.Search/FullSearchGateway.cs index d21dfb05b..af0b5063e 100644 --- a/src/services/Elastic.Documentation.Search/FullSearchGateway.cs +++ b/src/services/Elastic.Documentation.Search/FullSearchGateway.cs @@ -104,7 +104,7 @@ private async Task SearchWithHybridRrf(FullSearchRequest reque var response = await clientAccessor.Client.SearchAsync(s => { _ = s - .Indices(clientAccessor.Options.IndexName) + .Indices(clientAccessor.Endpoint.IndexName) .From(Math.Max(request.PageNumber - 1, 0) * request.PageSize) .Size(request.PageSize) .Query(filteredQuery) @@ -193,7 +193,7 @@ private async Task SearchLexicalOnly(FullSearchRequest request var response = await clientAccessor.Client.SearchAsync(s => { _ = s - .Indices(clientAccessor.Options.IndexName) + .Indices(clientAccessor.Endpoint.IndexName) .From(Math.Max(request.PageNumber - 1, 0) * request.PageSize) .Size(request.PageSize) .Query(filteredQuery) diff --git a/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs b/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs index a8c3710ae..797dd0121 100644 --- a/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs +++ b/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs @@ -46,7 +46,7 @@ public async Task SearchImplementation(string query, int var response = await clientAccessor.Client.SearchAsync(s => { _ = s - .Indices(clientAccessor.Options.IndexName) + 
.Indices(clientAccessor.Endpoint.IndexName) .From(Math.Max(pageNumber - 1, 0) * pageSize) .Size(pageSize) .Query(lexicalQuery) @@ -167,7 +167,7 @@ public async Task ExplainDocumentAsync(string query, string docum { // First, find the document by URL var getDocResponse = await clientAccessor.Client.SearchAsync(s => s - .Indices(clientAccessor.Options.IndexName) + .Indices(clientAccessor.Endpoint.IndexName) .Query(q => q.Term(t => t.Field(f => f.Url).Value(documentUrl))) .Size(1), ctx); @@ -186,7 +186,7 @@ public async Task ExplainDocumentAsync(string query, string docum // Now explain why this document matches (or doesn't match) the query var explainResponse = await clientAccessor.Client.ExplainAsync( - clientAccessor.Options.IndexName, documentId, e => e.Query(combinedQuery), ctx); + clientAccessor.Endpoint.IndexName, documentId, e => e.Query(combinedQuery), ctx); if (!explainResponse.IsValidResponse) { diff --git a/src/services/Elastic.Documentation.Search/ServicesExtension.cs b/src/services/Elastic.Documentation.Search/ServicesExtension.cs index 9b505c6f2..99e6619ae 100644 --- a/src/services/Elastic.Documentation.Search/ServicesExtension.cs +++ b/src/services/Elastic.Documentation.Search/ServicesExtension.cs @@ -25,8 +25,6 @@ public static IServiceCollection AddSearchServices(this IServiceCollection servi var logger = GetLogger(services); logger?.LogInformation("Configuring Search services"); - // Shared Elasticsearch options - DI auto-resolves IConfiguration from primary constructor - _ = services.AddSingleton(); _ = services.AddSingleton(); // Navigation Search (autocomplete/navigation search) diff --git a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs index 9e696c984..c7f0db5a8 100644 --- a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs +++ 
b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs @@ -8,6 +8,7 @@ using Elastic.Documentation.Aspire; using Elastic.Documentation.Configuration; using Elastic.Documentation.Search; +using Elastic.Documentation.ServiceDefaults; using Elastic.Ingest.Elasticsearch; using Elastic.Ingest.Elasticsearch.Indices; using Elastic.Mapping; @@ -15,7 +16,6 @@ using Elastic.Transport; using Elastic.Transport.Products.Elasticsearch; using FluentAssertions; -using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; @@ -139,39 +139,12 @@ private async ValueTask IsIndexingNeeded() { try { - // Get Elasticsearch configuration from Aspire - var (elasticsearchUrl, apiKey, password, username) = GetElasticsearchConfiguration(); + var endpoints = ElasticsearchEndpointFactory.Create(); - if (string.IsNullOrEmpty(elasticsearchUrl)) - { - Console.WriteLine("No Elasticsearch URL configured, indexing will be performed."); - Connected = false; - return false; - } + var endpoint = endpoints.Elasticsearch; + Console.WriteLine($"Checking remote Elasticsearch at {endpoint.Uri} for existing data..."); - Console.WriteLine($"Checking remote Elasticsearch at {elasticsearchUrl} for existing data..."); - - // Create Elasticsearch endpoint configuration - var endpoint = new ElasticsearchEndpoint - { - Uri = new Uri(elasticsearchUrl), - ApiKey = apiKey, - Username = username, - Password = password - }; - - // Create transport configuration (similar to ElasticsearchMarkdownExporter) - var configuration = new ElasticsearchConfiguration(endpoint.Uri) - { - Authentication = endpoint.ApiKey is { } eApiKey - ? new ApiKey(eApiKey) - : endpoint is { Username: { } eUsername, Password: { } ePassword } - ? 
new BasicAuthentication(eUsername, ePassword) - : null, - EnableHttpCompression = true - }; - - var transport = new DistributedTransport(configuration); + var transport = ElasticsearchTransportFactory.Create(endpoint); Connected = (await transport.HeadAsync("/", TestContext.Current.CancellationToken)).ApiCallDetails.HasSuccessfulStatusCode; // Create a logger factory and diagnostics collector @@ -236,41 +209,6 @@ private async ValueTask ValidateResourceExitCode(string resourceName) Console.WriteLine($"{resourceName} completed with exit code 0"); } - /// - /// Gets Elasticsearch configuration from Aspire parameters and environment. - /// Manually reads user secrets from the aspire project, then falls back to environment variables. - /// - private (string? Url, string? ApiKey, string? Password, string? Username) GetElasticsearchConfiguration() - { - // Manually read user secrets from the aspire project - // UserSecretsId from aspire.csproj: 72f50f33-6fb9-4d08-bff3-39568fe370b3 - var configBuilder = new ConfigurationBuilder(); - configBuilder.AddUserSecrets("72f50f33-6fb9-4d08-bff3-39568fe370b3"); - var userSecretsConfig = configBuilder.Build(); - - // Get URL - try user secrets first, then Aspire configuration, then environment - var url = userSecretsConfig["Parameters:DocumentationElasticUrl"] - ?? fixture.DistributedApplication.Services.GetService()?["Parameters:DocumentationElasticUrl"] - ?? Environment.GetEnvironmentVariable("DOCUMENTATION_ELASTIC_URL"); - - // Get API Key - try user secrets first, then Aspire configuration, then environment - var apiKey = userSecretsConfig["Parameters:DocumentationElasticApiKey"] - ?? fixture.DistributedApplication.Services.GetService()?["Parameters:DocumentationElasticApiKey"] - ?? Environment.GetEnvironmentVariable("DOCUMENTATION_ELASTIC_APIKEY"); - - // Get password for local Elasticsearch (when using --start-elasticsearch) - var password = userSecretsConfig["Parameters:DocumentationElasticPassword"] ?? 
Environment.GetEnvironmentVariable("DOCUMENTATION_ELASTIC_PASSWORD"); - - // Get username (defaults to "elastic") - var username = userSecretsConfig["Parameters:DocumentationElasticUsername"] - ?? Environment.GetEnvironmentVariable("DOCUMENTATION_ELASTIC_USERNAME") - ?? "elastic"; - - Console.WriteLine($"Elasticsearch configuration retrieved: URL={url != null}, ApiKey={apiKey != null}, Password={password != null}"); - - return (url, apiKey, password, username); - } - public ValueTask DisposeAsync() { HttpClient?.Dispose(); diff --git a/tests-integration/Mcp.Remote.IntegrationTests/Mcp.Remote.IntegrationTests.csproj b/tests-integration/Mcp.Remote.IntegrationTests/Mcp.Remote.IntegrationTests.csproj index 2896294b0..8889bbcba 100644 --- a/tests-integration/Mcp.Remote.IntegrationTests/Mcp.Remote.IntegrationTests.csproj +++ b/tests-integration/Mcp.Remote.IntegrationTests/Mcp.Remote.IntegrationTests.csproj @@ -10,6 +10,7 @@ + diff --git a/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs b/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs index a6ec7c7dd..d2d0d696e 100644 --- a/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs +++ b/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs @@ -10,7 +10,7 @@ using Elastic.Documentation.Mcp.Remote.Tools; using Elastic.Documentation.Search; using Elastic.Documentation.Search.Common; -using Microsoft.Extensions.Configuration; +using Elastic.Documentation.ServiceDefaults; using Microsoft.Extensions.Logging.Abstractions; namespace Mcp.Remote.IntegrationTests; @@ -25,17 +25,16 @@ public abstract class McpToolsIntegrationTestsBase(ITestOutputHelper output) /// /// Creates SearchTools with all required dependencies. /// - protected (SearchTools? Tools, ElasticsearchClientAccessor? ClientAccessor) CreateSearchTools() + protected (SearchTools Tools, ElasticsearchClientAccessor? 
ClientAccessor) CreateSearchTools() { var clientAccessor = CreateElasticsearchClientAccessor(); - if (clientAccessor == null) - return (null, null); var productsConfig = CreateProductsConfiguration(); var fullSearchGateway = new FullSearchGateway( clientAccessor, productsConfig, - NullLogger.Instance); + NullLogger.Instance + ); var searchTools = new SearchTools(fullSearchGateway, NullLogger.Instance); return (searchTools, clientAccessor); @@ -44,16 +43,11 @@ public abstract class McpToolsIntegrationTestsBase(ITestOutputHelper output) /// /// Creates DocumentTools with all required dependencies. /// - protected (DocumentTools? Tools, ElasticsearchClientAccessor? ClientAccessor) CreateDocumentTools() + protected (DocumentTools Tools, ElasticsearchClientAccessor? ClientAccessor) CreateDocumentTools() { var clientAccessor = CreateElasticsearchClientAccessor(); - if (clientAccessor == null) - return (null, null); - - var documentGateway = new DocumentGateway( - clientAccessor, - NullLogger.Instance); + var documentGateway = new DocumentGateway(clientAccessor, NullLogger.Instance); var documentTools = new DocumentTools(documentGateway, NullLogger.Instance); return (documentTools, clientAccessor); } @@ -61,18 +55,12 @@ public abstract class McpToolsIntegrationTestsBase(ITestOutputHelper output) /// /// Creates CoherenceTools with all required dependencies. /// - protected (CoherenceTools? Tools, ElasticsearchClientAccessor? ClientAccessor) CreateCoherenceTools() + protected (CoherenceTools Tools, ElasticsearchClientAccessor? 
ClientAccessor) CreateCoherenceTools() { var clientAccessor = CreateElasticsearchClientAccessor(); - if (clientAccessor == null) - return (null, null); var productsConfig = CreateProductsConfiguration(); - var fullSearchGateway = new FullSearchGateway( - clientAccessor, - productsConfig, - NullLogger.Instance); - + var fullSearchGateway = new FullSearchGateway(clientAccessor, productsConfig, NullLogger.Instance); var coherenceTools = new CoherenceTools(fullSearchGateway, NullLogger.Instance); return (coherenceTools, clientAccessor); } @@ -80,34 +68,12 @@ public abstract class McpToolsIntegrationTestsBase(ITestOutputHelper output) /// /// Creates an ElasticsearchClientAccessor using configuration from user secrets and environment variables. /// - private static ElasticsearchClientAccessor? CreateElasticsearchClientAccessor() + private static ElasticsearchClientAccessor CreateElasticsearchClientAccessor() { - var configBuilder = new ConfigurationBuilder(); - configBuilder.AddUserSecrets("72f50f33-6fb9-4d08-bff3-39568fe370b3"); - configBuilder.AddEnvironmentVariables(); - var config = configBuilder.Build(); - - var elasticsearchUrl = - config["Parameters:DocumentationElasticUrl"] - ?? config["DOCUMENTATION_ELASTIC_URL"]; - - var elasticsearchApiKey = - config["Parameters:DocumentationElasticApiKey"] - ?? 
config["DOCUMENTATION_ELASTIC_APIKEY"]; - - if (string.IsNullOrEmpty(elasticsearchUrl) || string.IsNullOrEmpty(elasticsearchApiKey)) - return null; - - var testConfig = new ConfigurationBuilder() - .AddInMemoryCollection(new Dictionary - { - ["DOCUMENTATION_ELASTIC_URL"] = elasticsearchUrl, - ["DOCUMENTATION_ELASTIC_APIKEY"] = elasticsearchApiKey, - ["DOCUMENTATION_ELASTIC_INDEX"] = "semantic-docs-dev-latest" - }) - .Build(); - - var options = new ElasticsearchOptions(testConfig); + var endpoints = ElasticsearchEndpointFactory.Create(); + + endpoints.Elasticsearch.IndexName = "semantic-docs-dev-latest"; + var searchConfig = new SearchConfiguration { Synonyms = new Dictionary(), @@ -115,7 +81,7 @@ public abstract class McpToolsIntegrationTestsBase(ITestOutputHelper output) DiminishTerms = ["plugin", "client", "integration", "glossary"] }; - return new ElasticsearchClientAccessor(options, searchConfig); + return new ElasticsearchClientAccessor(endpoints, searchConfig); } /// diff --git a/tests-integration/Search.IntegrationTests/Search.IntegrationTests.csproj b/tests-integration/Search.IntegrationTests/Search.IntegrationTests.csproj index 0754b8a91..4763c5eb0 100644 --- a/tests-integration/Search.IntegrationTests/Search.IntegrationTests.csproj +++ b/tests-integration/Search.IntegrationTests/Search.IntegrationTests.csproj @@ -10,6 +10,7 @@ + diff --git a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs index 3d93fd012..9047368e2 100644 --- a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs +++ b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs @@ -5,8 +5,8 @@ using Elastic.Documentation.Configuration.Search; using Elastic.Documentation.Search; using Elastic.Documentation.Search.Common; +using Elastic.Documentation.ServiceDefaults; using FluentAssertions; -using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging.Abstractions; 
namespace Search.IntegrationTests; @@ -220,37 +220,12 @@ public async Task ExplainTopResultAndExpectedAsyncReturnsDetailedScoring() /// /// Creates an ElasticsearchGateway instance using configuration from the distributed application. /// - private NavigationSearchGateway? CreateFindPageGateway() + private NavigationSearchGateway CreateFindPageGateway() { - // Build a new ConfigurationBuilder to read user secrets and environment variables - var configBuilder = new ConfigurationBuilder(); - configBuilder.AddUserSecrets("72f50f33-6fb9-4d08-bff3-39568fe370b3"); - configBuilder.AddEnvironmentVariables(); - var config = configBuilder.Build(); - - // Get Elasticsearch configuration with fallback chain: user secrets → environment - var elasticsearchUrl = - config["Parameters:DocumentationElasticUrl"] - ?? config["DOCUMENTATION_ELASTIC_URL"]; - - var elasticsearchApiKey = - config["Parameters:DocumentationElasticApiKey"] - ?? config["DOCUMENTATION_ELASTIC_APIKEY"]; - - if (elasticsearchUrl is null or "" || elasticsearchApiKey is null or "") - return null; - - // Create IConfiguration with the required values for ElasticsearchOptions - var testConfig = new ConfigurationBuilder() - .AddInMemoryCollection(new Dictionary - { - ["DOCUMENTATION_ELASTIC_URL"] = elasticsearchUrl, - ["DOCUMENTATION_ELASTIC_APIKEY"] = elasticsearchApiKey, - ["DOCUMENTATION_ELASTIC_INDEX"] = "semantic-docs-dev-latest" - }) - .Build(); + var endpoints = ElasticsearchEndpointFactory.Create(); + + endpoints.Elasticsearch.IndexName = "semantic-docs-dev-latest"; - var options = new ElasticsearchOptions(testConfig); var searchConfig = new SearchConfiguration { Synonyms = new Dictionary(), @@ -278,7 +253,7 @@ public async Task ExplainTopResultAndExpectedAsyncReturnsDetailedScoring() DiminishTerms = ["plugin", "client", "integration", "glossary"] }; - var clientAccessor = new ElasticsearchClientAccessor(options, searchConfig); + var clientAccessor = new ElasticsearchClientAccessor(endpoints, searchConfig); 
return new NavigationSearchGateway(clientAccessor, NullLogger.Instance); } } From 22defec0369278be2ad66c88b01bb1e395318846 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Wed, 18 Feb 2026 16:09:12 +0100 Subject: [PATCH 03/15] Replace hardcoded IndexName with namespace-based index resolution Move mapping context (DocumentationMappingContext, LexicalConfig, SemanticConfig, DocumentationAnalysisFactory) from Elastic.Markdown to Elastic.Documentation so both indexing and search derive index names from the same source. Add ContentHash helper to avoid Elastic.Ingest.Elasticsearch dependency in Elastic.Documentation. Remove IndexName from ElasticsearchEndpoint, add Namespace to DocumentationEndpoints. ElasticsearchEndpointFactory resolves namespace from DOCUMENTATION_ELASTIC_INDEX env var (backward compat), DOTNET_ENVIRONMENT, ENVIRONMENT, or falls back to "dev". ElasticsearchClientAccessor derives SearchIndex and RulesetName from namespace instead of parsing the old IndexName string. Remove ExtractRulesetName and all hardcoded "semantic-docs-dev-latest" assignments from tests and config files. 
--- .../DocumentationEndpoints.cs | 7 +--- .../ElasticsearchEndpointFactory.cs | 38 ++++++++++++++++--- .../Search/ContentHash.cs | 19 ++++++++++ .../Search}/DocumentationMappingConfig.cs | 8 ++-- .../Elastic.Documentation.Api.App/Program.cs | 6 ++- .../Aws/LocalParameterProvider.cs | 4 -- .../Gateways/DocumentGateway.cs | 4 +- .../Program.cs | 6 ++- .../appsettings.development.json | 3 +- .../appsettings.edge.json | 3 +- .../Common/ElasticsearchClientAccessor.cs | 25 +++--------- .../FullSearchGateway.cs | 4 +- .../NavigationSearchGateway.cs | 6 +-- .../McpToolsIntegrationTestsBase.cs | 2 - .../SearchRelevanceTests.cs | 2 - 15 files changed, 79 insertions(+), 58 deletions(-) create mode 100644 src/Elastic.Documentation/Search/ContentHash.cs rename src/{Elastic.Markdown/Exporters/Elasticsearch => Elastic.Documentation/Search}/DocumentationMappingConfig.cs (96%) diff --git a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs index 2bec0b94b..69282068f 100644 --- a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs +++ b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs @@ -9,6 +9,7 @@ namespace Elastic.Documentation.Configuration; public class DocumentationEndpoints { public required ElasticsearchEndpoint Elasticsearch { get; init; } + public string Namespace { get; set; } = "dev"; } public class ElasticsearchEndpoint @@ -28,12 +29,6 @@ public class ElasticsearchEndpoint // index options public string IndexNamePrefix { get; set; } = "semantic-docs"; - public string IndexName - { - get => field ?? 
$"{IndexNamePrefix}-dev-latest"; - set; - } - // channel buffer options public int BufferSize { get; set; } = 50; // Reduced for Serverless rate limits public int MaxRetries { get; set; } = 5; // Increased for 429 retries diff --git a/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs b/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs index 418ff21ef..d5e6cd6ac 100644 --- a/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs +++ b/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs @@ -48,8 +48,6 @@ public static DocumentationEndpoints Create(IConfiguration? appConfiguration = n }; } - var indexName = appConfiguration?["DOCUMENTATION_ELASTIC_INDEX"]; - var endpoint = new ElasticsearchEndpoint { Uri = new Uri(url), @@ -58,9 +56,39 @@ public static DocumentationEndpoints Create(IConfiguration? appConfiguration = n Username = username }; - if (indexName is not null) - endpoint.IndexName = indexName; + var ns = ResolveNamespace(config, appConfiguration, endpoint.IndexNamePrefix); + + return new DocumentationEndpoints { Elasticsearch = endpoint, Namespace = ns }; + } + + /// + /// Resolves the deployment namespace using this priority: + /// 1. DOCUMENTATION_ELASTIC_INDEX env var — strip prefix and -latest suffix + /// 2. DOTNET_ENVIRONMENT env var + /// 3. ENVIRONMENT env var + /// 4. Fallback: "dev" + /// + private static string ResolveNamespace(IConfiguration config, IConfiguration? appConfiguration, string indexNamePrefix) + { + var indexName = appConfiguration?["DOCUMENTATION_ELASTIC_INDEX"] + ?? 
config["DOCUMENTATION_ELASTIC_INDEX"]; + + if (!string.IsNullOrEmpty(indexName)) + { + var prefix = $"{indexNamePrefix}-"; + const string suffix = "-latest"; + if (indexName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase) && + indexName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)) + { + var ns = indexName[prefix.Length..^suffix.Length]; + if (!string.IsNullOrEmpty(ns)) + return ns; + } + } + + var env = config["DOTNET_ENVIRONMENT"] + ?? config["ENVIRONMENT"]; - return new DocumentationEndpoints { Elasticsearch = endpoint }; + return !string.IsNullOrEmpty(env) ? env.ToLowerInvariant() : "dev"; } } diff --git a/src/Elastic.Documentation/Search/ContentHash.cs b/src/Elastic.Documentation/Search/ContentHash.cs new file mode 100644 index 000000000..17eb2e7ae --- /dev/null +++ b/src/Elastic.Documentation/Search/ContentHash.cs @@ -0,0 +1,19 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using System.Security.Cryptography; +using System.Text; + +namespace Elastic.Documentation.Search; + +/// Creates a short hex hash from one or more string components. +public static class ContentHash +{ + /// + /// Concatenates all components, computes SHA-256, and returns the first 16 hex characters (lowercased). + /// Compatible with HashedBulkUpdate.CreateHash. 
+ /// + public static string Create(params string[] components) => + Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(string.Join("", components))))[..16].ToLowerInvariant(); +} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs similarity index 96% rename from src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs rename to src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index 9ae1c7072..b36db5ca4 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -2,12 +2,10 @@ // Elasticsearch B.V licenses this file to you under the Apache 2.0 License. // See the LICENSE file in the project root for more information -using Elastic.Documentation.Search; -using Elastic.Ingest.Elasticsearch.Indices; using Elastic.Mapping; using Elastic.Mapping.Analysis; -namespace Elastic.Markdown.Exporters.Elasticsearch; +namespace Elastic.Documentation.Search; [ElasticsearchMappingContext] [Entity( @@ -185,8 +183,8 @@ public static ElasticsearchTypeContext CreateContext( string? 
defaultPipeline = null) { var settingsJson = BuildSettingsJson(synonymSetName, indexTimeSynonyms, defaultPipeline); - var settingsHash = HashedBulkUpdate.CreateHash(settingsJson); - var hash = HashedBulkUpdate.CreateHash(settingsHash, baseContext.MappingsHash); + var settingsHash = ContentHash.Create(settingsJson); + var hash = ContentHash.Create(settingsHash, baseContext.MappingsHash); return baseContext.WithIndexName(indexName) with { diff --git a/src/api/Elastic.Documentation.Api.App/Program.cs b/src/api/Elastic.Documentation.Api.App/Program.cs index 373ae9a46..9ee9ef56d 100644 --- a/src/api/Elastic.Documentation.Api.App/Program.cs +++ b/src/api/Elastic.Documentation.Api.App/Program.cs @@ -86,10 +86,12 @@ static void LogElasticsearchConfiguration(WebApplication app, ILogger logger) if (endpoints is not null) { var endpoint = endpoints.Elasticsearch; + var searchIndex = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{endpoints.Namespace}-latest"; logger.LogInformation( - "Elasticsearch configuration - Url: {Url}, Index: {Index}", + "Elasticsearch configuration - Url: {Url}, Namespace: {Namespace}, SearchIndex: {SearchIndex}", endpoint.Uri, - endpoint.IndexName + endpoints.Namespace, + searchIndex ); } else diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Aws/LocalParameterProvider.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Aws/LocalParameterProvider.cs index 04db74d4d..8ec6d6d18 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Aws/LocalParameterProvider.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Aws/LocalParameterProvider.cs @@ -58,10 +58,6 @@ public async Task GetParam(string name, bool withDecryption = true, Canc { return GetEnv("DOCUMENTATION_KIBANA_APIKEY"); } - case "docs-elasticsearch-index": - { - return GetEnv("DOCUMENTATION_ELASTIC_INDEX", "semantic-docs-dev-latest"); - } default: { throw new ArgumentException($"Parameter '{name}' not found in {nameof(LocalParameterProvider)}"); diff --git 
a/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs b/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs index 168926126..aeba0be88 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Gateways/DocumentGateway.cs @@ -24,7 +24,7 @@ public class DocumentGateway( try { var response = await clientAccessor.Client.SearchAsync(s => s - .Indices(clientAccessor.Endpoint.IndexName) + .Indices(clientAccessor.SearchIndex) .Query(q => q.Term(t => t.Field(f => f.Url.Suffix("keyword")).Value(url))) .Size(1) .Source(sf => sf.Filter(f => f.Includes( @@ -101,7 +101,7 @@ public class DocumentGateway( try { var response = await clientAccessor.Client.SearchAsync(s => s - .Indices(clientAccessor.Endpoint.IndexName) + .Indices(clientAccessor.SearchIndex) .Query(q => q.Term(t => t.Field(f => f.Url.Suffix("keyword")).Value(url))) .Size(1) .Source(sf => sf.Filter(f => f.Includes( diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs index b9ae29394..3fdde5ac5 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs @@ -87,10 +87,12 @@ static void LogElasticsearchConfiguration(WebApplication app, ILogger logger) if (endpoints is not null) { var endpoint = endpoints.Elasticsearch; + var searchIndex = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{endpoints.Namespace}-latest"; logger.LogInformation( - "Elasticsearch configuration - Url: {Url}, Index: {Index}", + "Elasticsearch configuration - Url: {Url}, Namespace: {Namespace}, SearchIndex: {SearchIndex}", endpoint.Uri, - endpoint.IndexName + endpoints.Namespace, + searchIndex ); } else diff --git a/src/api/Elastic.Documentation.Mcp.Remote/appsettings.development.json b/src/api/Elastic.Documentation.Mcp.Remote/appsettings.development.json index 15cac94ee..34f00ef13 100644 --- 
a/src/api/Elastic.Documentation.Mcp.Remote/appsettings.development.json +++ b/src/api/Elastic.Documentation.Mcp.Remote/appsettings.development.json @@ -4,6 +4,5 @@ "Default": "Debug", "Microsoft.AspNetCore": "Information" } - }, - "DOCUMENTATION_ELASTIC_INDEX": "semantic-docs-dev-latest" + } } diff --git a/src/api/Elastic.Documentation.Mcp.Remote/appsettings.edge.json b/src/api/Elastic.Documentation.Mcp.Remote/appsettings.edge.json index fe7d17f7b..0c208ae91 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/appsettings.edge.json +++ b/src/api/Elastic.Documentation.Mcp.Remote/appsettings.edge.json @@ -4,6 +4,5 @@ "Default": "Information", "Microsoft.AspNetCore": "Warning" } - }, - "DOCUMENTATION_ELASTIC_INDEX": "semantic-docs-edge-latest" + } } diff --git a/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs b/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs index 747408968..9ee24d757 100644 --- a/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs +++ b/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs @@ -21,6 +21,7 @@ public class ElasticsearchClientAccessor : IDisposable public ElasticsearchClient Client { get; } public ElasticsearchEndpoint Endpoint { get; } public SearchConfiguration SearchConfiguration { get; } + public string SearchIndex { get; } public string? RulesetName { get; } public IReadOnlyDictionary SynonymBiDirectional { get; } public IReadOnlyCollection DiminishTerms { get; } @@ -34,8 +35,11 @@ public ElasticsearchClientAccessor( SearchConfiguration = searchConfiguration; SynonymBiDirectional = searchConfiguration.SynonymBiDirectional; DiminishTerms = searchConfiguration.DiminishTerms; + + SearchIndex = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{endpoints.Namespace}-latest"; + RulesetName = searchConfiguration.Rules.Count > 0 - ? ExtractRulesetName(endpoint.IndexName) + ? 
$"docs-ruleset-{endpoints.Namespace}" : null; _nodePool = new SingleNodePool(endpoint.Uri); @@ -49,29 +53,12 @@ public ElasticsearchClientAccessor( _nodePool, sourceSerializer: (_, settings) => new DefaultSourceSerializer(settings, EsJsonContext.Default) ) - .DefaultIndex(endpoint.IndexName) + .DefaultIndex(SearchIndex) .Authentication(auth); Client = new ElasticsearchClient(_clientSettings); } - /// - /// Extracts the ruleset name from the index name. - /// Index name format: "semantic-docs-{namespace}-latest" -> ruleset: "docs-ruleset-{namespace}" - /// The namespace may contain hyphens (e.g., "codex-engineering"), so we extract everything - /// between the "semantic-docs-" prefix and the "-latest" suffix. - /// - private static string? ExtractRulesetName(string indexName) - { - const string prefix = "semantic-docs-"; - const string suffix = "-latest"; - if (!indexName.StartsWith(prefix, StringComparison.Ordinal) || !indexName.EndsWith(suffix, StringComparison.Ordinal)) - return null; - - var ns = indexName[prefix.Length..^suffix.Length]; - return string.IsNullOrEmpty(ns) ? null : $"docs-ruleset-{ns}"; - } - /// /// Tests connectivity to the Elasticsearch cluster. 
/// diff --git a/src/services/Elastic.Documentation.Search/FullSearchGateway.cs b/src/services/Elastic.Documentation.Search/FullSearchGateway.cs index af0b5063e..7d1d3a3cf 100644 --- a/src/services/Elastic.Documentation.Search/FullSearchGateway.cs +++ b/src/services/Elastic.Documentation.Search/FullSearchGateway.cs @@ -104,7 +104,7 @@ private async Task SearchWithHybridRrf(FullSearchRequest reque var response = await clientAccessor.Client.SearchAsync(s => { _ = s - .Indices(clientAccessor.Endpoint.IndexName) + .Indices(clientAccessor.SearchIndex) .From(Math.Max(request.PageNumber - 1, 0) * request.PageSize) .Size(request.PageSize) .Query(filteredQuery) @@ -193,7 +193,7 @@ private async Task SearchLexicalOnly(FullSearchRequest request var response = await clientAccessor.Client.SearchAsync(s => { _ = s - .Indices(clientAccessor.Endpoint.IndexName) + .Indices(clientAccessor.SearchIndex) .From(Math.Max(request.PageNumber - 1, 0) * request.PageSize) .Size(request.PageSize) .Query(filteredQuery) diff --git a/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs b/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs index 797dd0121..dcb20c3e5 100644 --- a/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs +++ b/src/services/Elastic.Documentation.Search/NavigationSearchGateway.cs @@ -46,7 +46,7 @@ public async Task SearchImplementation(string query, int var response = await clientAccessor.Client.SearchAsync(s => { _ = s - .Indices(clientAccessor.Endpoint.IndexName) + .Indices(clientAccessor.SearchIndex) .From(Math.Max(pageNumber - 1, 0) * pageSize) .Size(pageSize) .Query(lexicalQuery) @@ -167,7 +167,7 @@ public async Task ExplainDocumentAsync(string query, string docum { // First, find the document by URL var getDocResponse = await clientAccessor.Client.SearchAsync(s => s - .Indices(clientAccessor.Endpoint.IndexName) + .Indices(clientAccessor.SearchIndex) .Query(q => q.Term(t => t.Field(f => f.Url).Value(documentUrl))) 
.Size(1), ctx); @@ -186,7 +186,7 @@ public async Task ExplainDocumentAsync(string query, string docum // Now explain why this document matches (or doesn't match) the query var explainResponse = await clientAccessor.Client.ExplainAsync( - clientAccessor.Endpoint.IndexName, documentId, e => e.Query(combinedQuery), ctx); + clientAccessor.SearchIndex, documentId, e => e.Query(combinedQuery), ctx); if (!explainResponse.IsValidResponse) { diff --git a/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs b/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs index d2d0d696e..67f8570a6 100644 --- a/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs +++ b/tests-integration/Mcp.Remote.IntegrationTests/McpToolsIntegrationTestsBase.cs @@ -72,8 +72,6 @@ private static ElasticsearchClientAccessor CreateElasticsearchClientAccessor() { var endpoints = ElasticsearchEndpointFactory.Create(); - endpoints.Elasticsearch.IndexName = "semantic-docs-dev-latest"; - var searchConfig = new SearchConfiguration { Synonyms = new Dictionary(), diff --git a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs index 9047368e2..34246dc35 100644 --- a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs +++ b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs @@ -224,8 +224,6 @@ private NavigationSearchGateway CreateFindPageGateway() { var endpoints = ElasticsearchEndpointFactory.Create(); - endpoints.Elasticsearch.IndexName = "semantic-docs-dev-latest"; - var searchConfig = new SearchConfiguration { Synonyms = new Dictionary(), From 736af109875b9a90aa981af67ad627de5d9e95c8 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Sun, 22 Feb 2026 16:26:13 +0100 Subject: [PATCH 04/15] Update Elastic.Ingest.Elasticsearch and Elastic.Mapping to 0.24.0 Enable IndexPatternUseBatchDate now that Elastic.Mapping supports it, and 
pass batchTimestamp to IngestChannelOptions in the lexical-only path so the channel uses the exporter's timestamp for index name computation. --- Directory.Packages.props | 4 +- .../Search/DocumentationMappingConfig.cs | 39 +++++-------------- .../ElasticsearchMarkdownExporter.cs | 2 +- 3 files changed, 12 insertions(+), 33 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 04966e308..58aadb514 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + diff --git a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index b36db5ca4..ec5bb391c 100644 --- a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -146,34 +146,10 @@ public static AnalysisBuilder BuildAnalysis(AnalysisBuilder analysis, string syn .Tokenizer("path_tokenizer", t => t.PathHierarchy() .Delimiter('/')); - /// - /// Creates the index settings JSON with analysis configuration and optional default pipeline. - /// - public static string BuildSettingsJson(string synonymSetName, string[] indexTimeSynonyms, string? defaultPipeline = null) - { - var analysis = BuildAnalysis(new AnalysisBuilder(), synonymSetName, indexTimeSynonyms); - var analysisJson = analysis.Build().ToJsonString(); - - if (defaultPipeline is not null) - { - // Merge default_pipeline into the settings JSON - return $$""" - { - "default_pipeline": "{{defaultPipeline}}", - "analysis": {{analysisJson}} - } - """; - } - - return $$""" - { - "analysis": {{analysisJson}} - } - """; - } - /// /// Creates an ElasticsearchTypeContext with runtime analysis settings and dynamic index name. + /// Analysis is provided via , which + /// Elastic.Ingest.Elasticsearch merges into the settings automatically. 
/// public static ElasticsearchTypeContext CreateContext( ElasticsearchTypeContext baseContext, @@ -182,16 +158,19 @@ public static ElasticsearchTypeContext CreateContext( string[] indexTimeSynonyms, string? defaultPipeline = null) { - var settingsJson = BuildSettingsJson(synonymSetName, indexTimeSynonyms, defaultPipeline); - var settingsHash = ContentHash.Create(settingsJson); + var analysisJson = BuildAnalysis(new AnalysisBuilder(), synonymSetName, indexTimeSynonyms).Build().ToJsonString(); + var settingsHash = ContentHash.Create(analysisJson, defaultPipeline ?? ""); var hash = ContentHash.Create(settingsHash, baseContext.MappingsHash); return baseContext.WithIndexName(indexName) with { - GetSettingsJson = () => settingsJson, + GetSettingsJson = defaultPipeline is not null + ? () => $$"""{ "default_pipeline": "{{defaultPipeline}}" }""" + : () => "{}", SettingsHash = settingsHash, Hash = hash, - ConfigureAnalysis = a => BuildAnalysis(a, synonymSetName, indexTimeSynonyms) + ConfigureAnalysis = a => BuildAnalysis(a, synonymSetName, indexTimeSynonyms), + IndexPatternUseBatchDate = true }; } } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 84c0e5c75..0bfa52ef2 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -134,7 +134,7 @@ IDocumentationConfigurationContext context else { _batchIndexDate = DateTimeOffset.UtcNow; - var options = new IngestChannelOptions(_transport, _lexicalTypeContext); + var options = new IngestChannelOptions(_transport, _lexicalTypeContext, _batchIndexDate); ConfigureChannelOptions(options); _lexicalOnlyChannel = new IngestChannel(options); } From 9a8799376732ee8a43de576b5c6b614ad2b0020b Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Sun, 22 Feb 2026 16:27:43 +0100 Subject: [PATCH 05/15] 
Use centralized ElasticsearchEndpointFactory and add skipOpenApi parameter Simplify DocumentationTooling endpoint resolution by delegating to ElasticsearchEndpointFactory. Add missing skipOpenApi parameter to IsolatedIndexService.Index call. --- .../IsolatedIndexService.cs | 1 + .../docs-builder/DocumentationTooling.cs | 30 +++---------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs b/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs index c4fcc6c0f..81d389c69 100644 --- a/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs +++ b/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs @@ -114,6 +114,7 @@ public async Task Index(IDiagnosticsCollector collector, return await Build(collector, fileSystem, metadataOnly: true, strict: false, path: path, output: null, pathPrefix: null, force: true, allowIndexing: null, exporters: exporters, canonicalBaseUrl: null, + skipOpenApi: true, ctx: ctx); } } diff --git a/src/tooling/docs-builder/DocumentationTooling.cs b/src/tooling/docs-builder/DocumentationTooling.cs index 298d82cd2..f8b3a952c 100644 --- a/src/tooling/docs-builder/DocumentationTooling.cs +++ b/src/tooling/docs-builder/DocumentationTooling.cs @@ -14,6 +14,7 @@ using Elastic.Documentation.Configuration.Search; using Elastic.Documentation.Configuration.Versions; using Elastic.Documentation.Diagnostics; +using Elastic.Documentation.ServiceDefaults; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging; @@ -42,33 +43,10 @@ public static TBuilder AddDocumentationToolingDefaults(this TBuilder b return new DiagnosticsCollector([]); return new ConsoleDiagnosticsCollector(logFactory, githubActionsService); }) - .AddSingleton(sp => + .AddSingleton(_ => { - var resolver = sp.GetRequiredService(); - var elasticsearchUri = ResolveServiceEndpoint(resolver, - () => 
TryEnvVars("http://localhost:9200", "DOCUMENTATION_ELASTIC_URL", "CONNECTIONSTRINGS__ELASTICSEARCH") - ); - var elasticsearchPassword = - elasticsearchUri.UserInfo is { } userInfo && userInfo.Contains(':') - ? userInfo.Split(':')[1] - : TryEnvVarsOptional("DOCUMENTATION_ELASTIC_PASSWORD"); - - var elasticsearchUser = - elasticsearchUri.UserInfo is { } userInfo2 && userInfo2.Contains(':') - ? userInfo2.Split(':')[0] - : TryEnvVars("elastic", "DOCUMENTATION_ELASTIC_USERNAME"); - - var elasticsearchApiKey = TryEnvVarsOptional("DOCUMENTATION_ELASTIC_APIKEY"); - return new DocumentationEndpoints - { - Elasticsearch = new ElasticsearchEndpoint - { - Uri = elasticsearchUri, - Password = elasticsearchPassword, - ApiKey = elasticsearchApiKey, - Username = elasticsearchUser - }, - }; + var endpoints = ElasticsearchEndpointFactory.Create(builder.Configuration); + return endpoints; }) .AddSingleton(sp => { From 3e8034abcc9e0d57a278e66ba3b48795c16e6999 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Sun, 22 Feb 2026 16:53:53 +0100 Subject: [PATCH 06/15] Remove --no-semantic flag entirely The lexical-only code path manually reimplemented drain, delete-stale, refresh, and alias logic that the orchestrator handles automatically. Remove the flag end-to-end: CLI parameters, configuration, exporter branching, and CLI documentation. 
--- docs/cli/assembler/assembler-index.md | 3 - docs/cli/docset/index-command.md | 3 - .../DocumentationEndpoints.cs | 1 - .../ElasticsearchEndpointConfigurator.cs | 3 - .../ElasticsearchMarkdownExporter.cs | 130 +++++------------- .../Indexing/AssemblerIndexService.cs | 3 - .../IsolatedIndexService.cs | 3 - .../Assembler/AssemblerIndexCommand.cs | 6 +- .../Commands/Codex/CodexIndexCommand.cs | 3 - .../docs-builder/Commands/IndexCommand.cs | 6 +- 10 files changed, 38 insertions(+), 123 deletions(-) diff --git a/docs/cli/assembler/assembler-index.md b/docs/cli/assembler/assembler-index.md index 5d551e4b4..8ae72ddcd 100644 --- a/docs/cli/assembler/assembler-index.md +++ b/docs/cli/assembler/assembler-index.md @@ -29,9 +29,6 @@ docs-builder assembler index [options...] [-h|--help] [--version] `--password` `` : Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD (optional) -`--no-semantic` `` -: Index without semantic fields (optional) - `--search-num-threads` `` : The number of search threads the inference endpoint should use. Defaults: 8 (optional) diff --git a/docs/cli/docset/index-command.md b/docs/cli/docset/index-command.md index 32aa3a25b..00e28cf1c 100644 --- a/docs/cli/docset/index-command.md +++ b/docs/cli/docset/index-command.md @@ -25,9 +25,6 @@ docs-builder index [options...] [-h|--help] [--version] `--password` `` : Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD (optional) -`--no-semantic` `` -: Index without semantic fields (optional) - `--search-num-threads` `` : The number of search threads the inference endpoint should use. 
Defaults: 8 (optional) diff --git a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs index 69282068f..3636c0478 100644 --- a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs +++ b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs @@ -44,7 +44,6 @@ public class ElasticsearchEndpoint public X509Certificate? Certificate { get; set; } public bool CertificateIsNotRoot { get; set; } public int? BootstrapTimeout { get; set; } - public bool NoSemantic { get; set; } public bool ForceReindex { get; set; } /// diff --git a/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs b/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs index e8031368c..035fe559c 100644 --- a/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs +++ b/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs @@ -21,7 +21,6 @@ public record ElasticsearchIndexOptions public string? Password { get; init; } // inference options - public bool? NoSemantic { get; init; } public bool? EnableAiEnrichment { get; init; } public int? SearchNumThreads { get; init; } public int? 
IndexNumThreads { get; init; } @@ -117,8 +116,6 @@ public static async Task ApplyAsync( if (options.BootstrapTimeout.HasValue) cfg.BootstrapTimeout = options.BootstrapTimeout.Value; - if (options.NoSemantic.HasValue) - cfg.NoSemantic = options.NoSemantic.Value; if (options.EnableAiEnrichment.HasValue) cfg.EnableAiEnrichment = options.EnableAiEnrichment.Value; if (options.ForceReindex.HasValue) diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 0bfa52ef2..300a1a6bb 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -30,16 +30,12 @@ public partial class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposa private readonly string _indexNamespace; private readonly DateTimeOffset _batchIndexDate; - // Ingest: orchestrator for dual-index mode, plain channel for --no-semantic - private readonly IncrementalSyncOrchestrator? _orchestrator; - private readonly IngestChannel? _lexicalOnlyChannel; + // Ingest: orchestrator for dual-index mode + private readonly IncrementalSyncOrchestrator _orchestrator; // Type context hashes for document content hash computation private readonly ElasticsearchTypeContext _lexicalTypeContext; - private readonly ElasticsearchTypeContext? 
_semanticTypeContext; - - // Alias names for queries/statistics - private readonly string _lexicalAlias; + private readonly ElasticsearchTypeContext _semanticTypeContext; private readonly IReadOnlyDictionary _synonyms; private readonly IReadOnlyCollection _rules; @@ -91,11 +87,11 @@ IDocumentationConfigurationContext context var synonymSetName = $"docs-{indexNamespace}"; var ns = indexNamespace.ToLowerInvariant(); var lexicalPrefix = es.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant(); - _lexicalAlias = $"{lexicalPrefix}-{ns}"; + var lexicalAlias = $"{lexicalPrefix}-{ns}"; _lexicalTypeContext = DocumentationAnalysisFactory.CreateContext( DocumentationMappingContext.DocumentationDocument.Context, - _lexicalAlias, synonymSetName, indexTimeSynonyms, aiPipeline + lexicalAlias, synonymSetName, indexTimeSynonyms, aiPipeline ); // Initialize AI enrichment services if enabled @@ -106,38 +102,28 @@ IDocumentationConfigurationContext context _enrichPolicyManager = new EnrichPolicyManager(_transport, logFactory.CreateLogger(), _enrichmentCache.IndexName); } - if (!es.NoSemantic) - { - var semanticAlias = $"{es.IndexNamePrefix.ToLowerInvariant()}-{ns}"; - _semanticTypeContext = DocumentationAnalysisFactory.CreateContext( - DocumentationMappingContext.DocumentationDocumentSemantic.Context, - semanticAlias, synonymSetName, indexTimeSynonyms, aiPipeline - ); - - _orchestrator = new IncrementalSyncOrchestrator(_transport, _lexicalTypeContext, _semanticTypeContext) - { - ConfigurePrimary = ConfigureChannelOptions, - ConfigureSecondary = ConfigureChannelOptions, - OnPostComplete = es.EnableAiEnrichment - ? 
async (ctx, ct) => await PostCompleteAsync(ctx, ct) - : null - }; - _ = _orchestrator.AddPreBootstrapTask(async (_, ct) => - { - await InitializeEnrichmentAsync(ct); - await PublishSynonymsAsync(ct); - await PublishQueryRulesAsync(ct); - }); + var semanticAlias = $"{es.IndexNamePrefix.ToLowerInvariant()}-{ns}"; + _semanticTypeContext = DocumentationAnalysisFactory.CreateContext( + DocumentationMappingContext.DocumentationDocumentSemantic.Context, + semanticAlias, synonymSetName, indexTimeSynonyms, aiPipeline + ); - _batchIndexDate = _orchestrator.BatchTimestamp; - } - else + _orchestrator = new IncrementalSyncOrchestrator(_transport, _lexicalTypeContext, _semanticTypeContext) { - _batchIndexDate = DateTimeOffset.UtcNow; - var options = new IngestChannelOptions(_transport, _lexicalTypeContext, _batchIndexDate); - ConfigureChannelOptions(options); - _lexicalOnlyChannel = new IngestChannel(options); - } + ConfigurePrimary = ConfigureChannelOptions, + ConfigureSecondary = ConfigureChannelOptions, + OnPostComplete = es.EnableAiEnrichment + ? 
async (ctx, ct) => await PostCompleteAsync(ctx, ct) + : null + }; + _ = _orchestrator.AddPreBootstrapTask(async (_, ct) => + { + await InitializeEnrichmentAsync(ct); + await PublishSynonymsAsync(ct); + await PublishQueryRulesAsync(ct); + }); + + _batchIndexDate = _orchestrator.BatchTimestamp; } private void ConfigureChannelOptions(IngestChannelOptions options) @@ -167,51 +153,13 @@ private void ConfigureChannelOptions(IngestChannelOptions /// public async ValueTask StartAsync(Cancel ctx = default) { - if (_orchestrator is not null) - { - _ = await _orchestrator.StartAsync(BootstrapMethod.Failure, ctx); - _logger.LogInformation("Orchestrator started with {Strategy} strategy", _orchestrator.Strategy); - return; - } - - // NoSemantic path - await InitializeEnrichmentAsync(ctx); - await PublishSynonymsAsync(ctx); - await PublishQueryRulesAsync(ctx); - _ = await _lexicalOnlyChannel!.BootstrapElasticsearchAsync(BootstrapMethod.Failure, ctx); + _ = await _orchestrator.StartAsync(BootstrapMethod.Failure, ctx); + _logger.LogInformation("Orchestrator started with {Strategy} strategy", _orchestrator.Strategy); } /// - public async ValueTask StopAsync(Cancel ctx = default) - { - if (_orchestrator is not null) - { - _ = await _orchestrator.CompleteAsync(null, ctx); - return; - } - - // NoSemantic path — drain, delete stale, refresh, alias - var drained = await _lexicalOnlyChannel!.WaitForDrainAsync(null, ctx); - if (!drained) - _collector.EmitGlobalError("Elasticsearch export: failed to drain in a timely fashion"); - - // Delete stale documents not part of this batch - var deleteQuery = PostData.String($$""" - { - "query": { - "range": { - "batch_index_date": { - "lt": "{{_batchIndexDate:o}}" - } - } - } - } - """); - await _operations.DeleteByQueryAsync(_lexicalAlias, deleteQuery, ctx); - - _ = await _lexicalOnlyChannel.RefreshAsync(ctx); - _ = await _lexicalOnlyChannel.ApplyAliasesAsync(_lexicalAlias, ctx); - } + public async ValueTask StopAsync(Cancel ctx = default) => + _ = 
await _orchestrator.CompleteAsync(null, ctx); private async Task InitializeEnrichmentAsync(Cancel ctx) { @@ -251,7 +199,7 @@ private async ValueTask ExecuteEnrichPolicyIfNeededAsync(string? semanticAlias, private async ValueTask BackfillMissingAiFieldsAsync(string semanticAlias, Cancel ctx) { - if (_endpoint.NoSemantic || _enrichmentCache is null || _llmClient is null) + if (_enrichmentCache is null || _llmClient is null) return; var currentPromptHash = ElasticsearchLlmClient.PromptHash; @@ -354,26 +302,16 @@ private async Task PutQueryRuleset(QueryRuleset ruleset, string rulesetName, Can internal async ValueTask WriteDocumentAsync(DocumentationDocument doc, Cancel ctx) { - if (_orchestrator is not null) - { - if (_orchestrator.TryWrite(doc)) - return true; - _ = await _orchestrator.WaitToWriteAsync(doc, ctx); - return true; - } - - if (_lexicalOnlyChannel!.TryWrite(doc)) + if (_orchestrator.TryWrite(doc)) return true; - if (await _lexicalOnlyChannel.WaitToWriteAsync(ctx)) - return _lexicalOnlyChannel.TryWrite(doc); - return false; + _ = await _orchestrator.WaitToWriteAsync(doc, ctx); + return true; } /// public void Dispose() { - _orchestrator?.Dispose(); - _lexicalOnlyChannel?.Dispose(); + _orchestrator.Dispose(); _llmClient?.Dispose(); GC.SuppressFinalize(this); } diff --git a/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs b/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs index 1e44b5c93..cb84652c2 100644 --- a/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs +++ b/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs @@ -32,7 +32,6 @@ ICoreService githubActionsService /// Elasticsearch API key, alternatively set env DOCUMENTATION_ELASTIC_APIKEY /// Elasticsearch username (basic auth), alternatively set env DOCUMENTATION_ELASTIC_USERNAME /// Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD - /// Index 
without semantic fields /// Enable AI enrichment of documents using LLM-generated metadata /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 @@ -60,7 +59,6 @@ public async Task Index(IDiagnosticsCollector collector, string? username = null, string? password = null, // inference options - bool? noSemantic = null, bool? enableAiEnrichment = null, int? searchNumThreads = null, int? indexNumThreads = null, @@ -91,7 +89,6 @@ public async Task Index(IDiagnosticsCollector collector, ApiKey = apiKey, Username = username, Password = password, - NoSemantic = noSemantic, EnableAiEnrichment = enableAiEnrichment, SearchNumThreads = searchNumThreads, IndexNumThreads = indexNumThreads, diff --git a/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs b/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs index 81d389c69..9d996c8ed 100644 --- a/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs +++ b/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs @@ -29,7 +29,6 @@ ICoreService githubActionsService /// Elasticsearch API key, alternatively set env DOCUMENTATION_ELASTIC_APIKEY /// Elasticsearch username (basic auth), alternatively set env DOCUMENTATION_ELASTIC_USERNAME /// Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD - /// Index without semantic fields /// Enable AI enrichment of documents using LLM-generated metadata /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 @@ -57,7 +56,6 @@ public async Task Index(IDiagnosticsCollector collector, string? username = null, string? password = null, // inference options - bool? noSemantic = null, bool? enableAiEnrichment = null, int? searchNumThreads = null, int? 
indexNumThreads = null, @@ -88,7 +86,6 @@ public async Task Index(IDiagnosticsCollector collector, ApiKey = apiKey, Username = username, Password = password, - NoSemantic = noSemantic, EnableAiEnrichment = enableAiEnrichment, SearchNumThreads = searchNumThreads, IndexNumThreads = indexNumThreads, diff --git a/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs b/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs index 115dda5b9..ba3e8ebc9 100644 --- a/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs +++ b/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs @@ -30,7 +30,6 @@ ICoreService githubActionsService /// Elasticsearch API key, alternatively set env DOCUMENTATION_ELASTIC_APIKEY /// Elasticsearch username (basic auth), alternatively set env DOCUMENTATION_ELASTIC_USERNAME /// Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD - /// Index without semantic fields /// Enable AI enrichment of documents using LLM-generated metadata /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 @@ -59,7 +58,6 @@ public async Task Index( string? password = null, // inference options - bool? noSemantic = null, bool? enableAiEnrichment = null, int? searchNumThreads = null, int? 
indexNumThreads = null, @@ -97,7 +95,7 @@ public async Task Index( // endpoint options endpoint, environment, apiKey, username, password, // inference options - noSemantic, enableAiEnrichment, indexNumThreads, searchNumThreads, noEis, bootstrapTimeout, + enableAiEnrichment, indexNumThreads, searchNumThreads, noEis, bootstrapTimeout, // channel and connection options indexNamePrefix, forceReindex, bufferSize, maxRetries, debugMode, // proxy options @@ -110,7 +108,7 @@ static async (s, collector, state, ctx) => await s.Index(collector, state.fs, // endpoint options state.endpoint, state.environment, state.apiKey, state.username, state.password, // inference options - state.noSemantic, state.enableAiEnrichment, state.searchNumThreads, state.indexNumThreads, state.noEis, state.bootstrapTimeout, + state.enableAiEnrichment, state.searchNumThreads, state.indexNumThreads, state.noEis, state.bootstrapTimeout, // channel and connection options state.indexNamePrefix, state.forceReindex, state.bufferSize, state.maxRetries, state.debugMode, // proxy options diff --git a/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs b/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs index 2af22611d..57ba9bdf2 100644 --- a/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs +++ b/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs @@ -35,7 +35,6 @@ ICoreService githubActionsService /// Elasticsearch API key, alternatively set env DOCUMENTATION_ELASTIC_APIKEY /// Elasticsearch username (basic auth), alternatively set env DOCUMENTATION_ELASTIC_USERNAME /// Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD - /// Index without semantic fields /// Enable AI enrichment of documents using LLM-generated metadata /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 @@ -64,7 +63,6 @@ public async Task Index( string? 
password = null, // inference options - bool? noSemantic = null, bool? enableAiEnrichment = null, int? searchNumThreads = null, int? indexNumThreads = null, @@ -126,7 +124,6 @@ public async Task Index( ApiKey = apiKey, Username = username, Password = password, - NoSemantic = noSemantic, EnableAiEnrichment = enableAiEnrichment, SearchNumThreads = searchNumThreads, IndexNumThreads = indexNumThreads, diff --git a/src/tooling/docs-builder/Commands/IndexCommand.cs b/src/tooling/docs-builder/Commands/IndexCommand.cs index efc1af596..ad34f4eaf 100644 --- a/src/tooling/docs-builder/Commands/IndexCommand.cs +++ b/src/tooling/docs-builder/Commands/IndexCommand.cs @@ -28,7 +28,6 @@ ICoreService githubActionsService /// Elasticsearch API key, alternatively set env DOCUMENTATION_ELASTIC_APIKEY /// Elasticsearch username (basic auth), alternatively set env DOCUMENTATION_ELASTIC_USERNAME /// Elasticsearch password (basic auth), alternatively set env DOCUMENTATION_ELASTIC_PASSWORD - /// Index without semantic fields /// Enable AI enrichment of documents using LLM-generated metadata /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 @@ -57,7 +56,6 @@ public async Task Index( string? password = null, // inference options - bool? noSemantic = null, bool? enableAiEnrichment = null, int? searchNumThreads = null, int? 
indexNumThreads = null, @@ -95,7 +93,7 @@ public async Task Index( // endpoint options endpoint, apiKey, username, password, // inference options - noSemantic, enableAiEnrichment, indexNumThreads, noEis, searchNumThreads, bootstrapTimeout, + enableAiEnrichment, indexNumThreads, noEis, searchNumThreads, bootstrapTimeout, // channel and connection options indexNamePrefix, forceReindex, bufferSize, maxRetries, debugMode, // proxy options @@ -108,7 +106,7 @@ static async (s, collector, state, ctx) => await s.Index(collector, state.fs, st // endpoint options state.endpoint, state.apiKey, state.username, state.password, // inference options - state.noSemantic, state.enableAiEnrichment, state.searchNumThreads, state.indexNumThreads, state.noEis, state.bootstrapTimeout, + state.enableAiEnrichment, state.searchNumThreads, state.indexNumThreads, state.noEis, state.bootstrapTimeout, // channel and connection options state.indexNamePrefix, state.forceReindex, state.bufferSize, state.maxRetries, state.debugMode, // proxy options From 8f485d1a2aa1fdd7325990e6975b333f7d9dea55 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Sun, 22 Feb 2026 18:22:08 +0100 Subject: [PATCH 07/15] Add Jina v5 dense embeddings alongside ELSER sparse embeddings Add .jina-embeddings-v5-text-small inference on 6 fields (title, abstract, ai_rag_optimized_summary, ai_questions, ai_use_cases, stripped_body) to enable hybrid sparse+dense retrieval. Rename InferenceId to ElserInferenceId for clarity. 
--- .../Search/DocumentationMappingConfig.cs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index ec5bb391c..d3fc5e706 100644 --- a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -88,17 +88,31 @@ internal static DocumentationDocumentMappingsBuilder ConfigureCommonMappings(Doc public static class SemanticConfig { - private const string InferenceId = ".elser-2-elastic"; + private const string ElserInferenceId = ".elser-2-elastic"; + private const string JinaInferenceId = ".jina-embeddings-v5-text-small"; public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; public static DocumentationDocumentMappingsBuilder ConfigureMappings(DocumentationDocumentMappingsBuilder m) => LexicalConfig.ConfigureCommonMappings(m) - .AddField("title.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) - .AddField("abstract.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) - .AddField("ai_rag_optimized_summary.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) - .AddField("ai_questions.semantic_text", f => f.SemanticText().InferenceId(InferenceId)) - .AddField("ai_use_cases.semantic_text", f => f.SemanticText().InferenceId(InferenceId)); + .StrippedBody(s => s + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer") + .MultiField("jina", mf => mf.Text().Analyzer(JinaInferenceId)) + ) + // ELSER sparse embeddings + .AddField("title.semantic_text", f => f.SemanticText().InferenceId(ElserInferenceId)) + .AddField("abstract.semantic_text", f => f.SemanticText().InferenceId(ElserInferenceId)) + .AddField("ai_rag_optimized_summary.semantic_text", f => f.SemanticText().InferenceId(ElserInferenceId)) + .AddField("ai_questions.semantic_text", f => 
f.SemanticText().InferenceId(ElserInferenceId)) + .AddField("ai_use_cases.semantic_text", f => f.SemanticText().InferenceId(ElserInferenceId)) + // Jina v5 dense embeddings + .AddField("title.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) + .AddField("abstract.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) + .AddField("ai_rag_optimized_summary.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) + .AddField("ai_questions.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) + .AddField("ai_use_cases.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) + .AddField("stripped_body.jina", f => f.SemanticText().InferenceId(JinaInferenceId)); } /// From d8f4a32a00363f5d864600798220a77409e1b815 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Sun, 22 Feb 2026 19:03:17 +0100 Subject: [PATCH 08/15] fix import ordering --- src/api/Elastic.Documentation.Mcp.Remote/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs index 23d5495b8..129821347 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs @@ -3,9 +3,9 @@ // See the LICENSE file in the project root for more information using Elastic.Documentation.Api.Infrastructure.OpenTelemetry; -using Elastic.Documentation.Configuration; using Elastic.Documentation.Assembler.Links; using Elastic.Documentation.Assembler.Mcp; +using Elastic.Documentation.Configuration; using Elastic.Documentation.LinkIndex; using Elastic.Documentation.Links.InboundLinks; using Elastic.Documentation.Mcp.Remote; From 6d5802f428867efd1600e7c4bcf50774f3ccaa06 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Mon, 23 Feb 2026 11:00:51 +0100 Subject: [PATCH 09/15] Bump ingest libraries --- Directory.Packages.props | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Directory.Packages.props 
b/Directory.Packages.props index ff43854b8..093b318be 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + From a1a4ea04d2f69cd80835c4c1ba206206d4d2bfbb Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 24 Feb 2026 09:20:53 +0100 Subject: [PATCH 10/15] Update Elastic.Ingest.Elasticsearch and Elastic.Mapping to 0.27.0 Use source-generated IStaticMappingResolver delegates for auto-stamping BatchIndexDate and LastUpdated instead of manual assignment. Replace DocumentationAnalysisFactory.CreateContext with direct context customization via WithIndexName() and record-with expressions. Pass IndexSettings for default_pipeline conditionally at runtime. --- Directory.Packages.props | 4 +- .../Search/DocumentationDocument.cs | 2 + .../Search/DocumentationMappingConfig.cs | 40 ++--------- .../ElasticsearchMarkdownExporter.Export.cs | 2 - .../ElasticsearchMarkdownExporter.cs | 67 +++++++++++-------- 5 files changed, 49 insertions(+), 66 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 093b318be..20f8117f1 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + diff --git a/src/Elastic.Documentation/Search/DocumentationDocument.cs b/src/Elastic.Documentation/Search/DocumentationDocument.cs index e25ded0ab..bfbaace5d 100644 --- a/src/Elastic.Documentation/Search/DocumentationDocument.cs +++ b/src/Elastic.Documentation/Search/DocumentationDocument.cs @@ -73,10 +73,12 @@ public record DocumentationDocument /// The date of the batch update this document was part of last. /// This date could be higher than the date_last_updated. 
+ [BatchIndexDate] [JsonPropertyName("batch_index_date")] public DateTimeOffset BatchIndexDate { get; set; } /// The date this document was last updated, + [LastUpdated] [Timestamp] [JsonPropertyName("last_updated")] public DateTimeOffset LastUpdated { get; set; } diff --git a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index d3fc5e706..e07f0cbf0 100644 --- a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -34,7 +34,11 @@ public static class LexicalConfig public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; public static DocumentationDocumentMappingsBuilder ConfigureMappings(DocumentationDocumentMappingsBuilder m) => - ConfigureCommonMappings(m); + ConfigureCommonMappings(m) + .StrippedBody(f => f + .Analyzer("synonyms_fixed_analyzer") + .SearchAnalyzer("synonyms_analyzer") + ); internal static DocumentationDocumentMappingsBuilder ConfigureCommonMappings(DocumentationDocumentMappingsBuilder m) => m // Text fields with custom analyzers and multi-fields @@ -51,9 +55,6 @@ internal static DocumentationDocumentMappingsBuilder ConfigureCommonMappings(Doc .Analyzer("starts_with_analyzer") .SearchAnalyzer("starts_with_analyzer_search")) .MultiField("completion", mf => mf.SearchAsYouType().SearchAnalyzer("synonyms_analyzer"))) - .StrippedBody(f => f - .Analyzer("synonyms_fixed_analyzer") - .SearchAnalyzer("synonyms_analyzer")) .Abstract(f => f .Analyzer("synonyms_fixed_analyzer") .SearchAnalyzer("synonyms_analyzer")) @@ -98,7 +99,6 @@ public static DocumentationDocumentMappingsBuilder ConfigureMappings(Documentati .StrippedBody(s => s .Analyzer("synonyms_fixed_analyzer") .SearchAnalyzer("synonyms_analyzer") - .MultiField("jina", mf => mf.Text().Analyzer(JinaInferenceId)) ) // ELSER sparse embeddings .AddField("title.semantic_text", f => 
f.SemanticText().InferenceId(ElserInferenceId)) @@ -111,8 +111,7 @@ public static DocumentationDocumentMappingsBuilder ConfigureMappings(Documentati .AddField("abstract.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) .AddField("ai_rag_optimized_summary.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) .AddField("ai_questions.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) - .AddField("ai_use_cases.jina", f => f.SemanticText().InferenceId(JinaInferenceId)) - .AddField("stripped_body.jina", f => f.SemanticText().InferenceId(JinaInferenceId)); + .AddField("ai_use_cases.jina", f => f.SemanticText().InferenceId(JinaInferenceId)); } /// @@ -160,31 +159,4 @@ public static AnalysisBuilder BuildAnalysis(AnalysisBuilder analysis, string syn .Tokenizer("path_tokenizer", t => t.PathHierarchy() .Delimiter('/')); - /// - /// Creates an ElasticsearchTypeContext with runtime analysis settings and dynamic index name. - /// Analysis is provided via , which - /// Elastic.Ingest.Elasticsearch merges into the settings automatically. - /// - public static ElasticsearchTypeContext CreateContext( - ElasticsearchTypeContext baseContext, - string indexName, - string synonymSetName, - string[] indexTimeSynonyms, - string? defaultPipeline = null) - { - var analysisJson = BuildAnalysis(new AnalysisBuilder(), synonymSetName, indexTimeSynonyms).Build().ToJsonString(); - var settingsHash = ContentHash.Create(analysisJson, defaultPipeline ?? ""); - var hash = ContentHash.Create(settingsHash, baseContext.MappingsHash); - - return baseContext.WithIndexName(indexName) with - { - GetSettingsJson = defaultPipeline is not null - ? 
() => $$"""{ "default_pipeline": "{{defaultPipeline}}" }""" - : () => "{}", - SettingsHash = settingsHash, - Hash = hash, - ConfigureAnalysis = a => BuildAnalysis(a, synonymSetName, indexTimeSynonyms), - IndexPatternUseBatchDate = true - }; - } } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs index a4c2172d8..e69dfc9e5 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs @@ -37,8 +37,6 @@ private void AssignDocumentMetadata(DocumentationDocument doc) _fixedSynonymsHash ); doc.Hash = hash; - doc.LastUpdated = _batchIndexDate; - doc.BatchIndexDate = _batchIndexDate; } private static void CommonEnrichments(DocumentationDocument doc, INavigationItem? navigationItem) diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 300a1a6bb..965ebd179 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -28,7 +28,6 @@ public partial class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposa private readonly ElasticsearchEndpoint _endpoint; private readonly DistributedTransport _transport; private readonly string _indexNamespace; - private readonly DateTimeOffset _batchIndexDate; // Ingest: orchestrator for dual-index mode private readonly IncrementalSyncOrchestrator _orchestrator; @@ -89,10 +88,16 @@ IDocumentationConfigurationContext context var lexicalPrefix = es.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant(); var lexicalAlias = $"{lexicalPrefix}-{ns}"; - _lexicalTypeContext = DocumentationAnalysisFactory.CreateContext( - 
DocumentationMappingContext.DocumentationDocument.Context, - lexicalAlias, synonymSetName, indexTimeSynonyms, aiPipeline - ); + var pipelineSettings = aiPipeline is not null + ? new Dictionary { ["index.default_pipeline"] = aiPipeline } + : null; + + _lexicalTypeContext = DocumentationMappingContext.DocumentationDocument.Context + .WithIndexName(lexicalAlias) with + { + ConfigureAnalysis = a => DocumentationAnalysisFactory.BuildAnalysis(a, synonymSetName, indexTimeSynonyms), + IndexSettings = pipelineSettings + }; // Initialize AI enrichment services if enabled if (es.EnableAiEnrichment) @@ -103,17 +108,23 @@ IDocumentationConfigurationContext context } var semanticAlias = $"{es.IndexNamePrefix.ToLowerInvariant()}-{ns}"; - _semanticTypeContext = DocumentationAnalysisFactory.CreateContext( - DocumentationMappingContext.DocumentationDocumentSemantic.Context, - semanticAlias, synonymSetName, indexTimeSynonyms, aiPipeline - ); + _semanticTypeContext = DocumentationMappingContext.DocumentationDocumentSemantic.Context + .WithIndexName(semanticAlias) with + { + ConfigureAnalysis = a => DocumentationAnalysisFactory.BuildAnalysis(a, synonymSetName, indexTimeSynonyms), + IndexSettings = pipelineSettings + }; - _orchestrator = new IncrementalSyncOrchestrator(_transport, _lexicalTypeContext, _semanticTypeContext) + var resolver = DocumentationMappingContext.DocumentationDocument; + _orchestrator = new IncrementalSyncOrchestrator( + _transport, _lexicalTypeContext, _semanticTypeContext, + setBatchIndexDate: resolver.SetBatchIndexDate, + setLastUpdated: resolver.SetLastUpdated) { ConfigurePrimary = ConfigureChannelOptions, ConfigureSecondary = ConfigureChannelOptions, OnPostComplete = es.EnableAiEnrichment - ? async (ctx, ct) => await PostCompleteAsync(ctx, ct) + ? 
async (ctx, _, ct) => await PostCompleteAsync(ctx, ct) : null }; _ = _orchestrator.AddPreBootstrapTask(async (_, ct) => @@ -122,8 +133,6 @@ IDocumentationConfigurationContext context await PublishSynonymsAsync(ct); await PublishQueryRulesAsync(ct); }); - - _batchIndexDate = _orchestrator.BatchTimestamp; } private void ConfigureChannelOptions(IngestChannelOptions options) @@ -209,19 +218,19 @@ private async ValueTask BackfillMissingAiFieldsAsync(string semanticAlias, Cance _enrichmentCache.Count, currentPromptHash[..8]); var query = $$""" - { - "query": { - "bool": { - "must": { "exists": { "field": "enrichment_key" } }, - "should": [ - { "bool": { "must_not": { "exists": { "field": "ai_questions" } } } }, - { "bool": { "must_not": { "term": { "enrichment_prompt_hash": "{{currentPromptHash}}" } } } } - ], - "minimum_should_match": 1 - } - } - } - """; + { + "query": { + "bool": { + "must": { "exists": { "field": "enrichment_key" } }, + "should": [ + { "bool": { "must_not": { "exists": { "field": "ai_questions" } } } }, + { "bool": { "must_not": { "term": { "enrichment_prompt_hash": "{{currentPromptHash}}" } } } } + ], + "minimum_should_match": 1 + } + } + } + """; await _operations.UpdateByQueryAsync(semanticAlias, PostData.String(query), EnrichPolicyManager.PipelineName, ctx); } @@ -252,7 +261,8 @@ private async Task PutSynonyms(SynonymsSet synonymsSet, string setName, Cancel c ctx); if (!response.ApiCallDetails.HasSuccessfulStatusCode) - _collector.EmitGlobalError($"Failed to publish synonym set '{setName}'. Reason: {response.ApiCallDetails.OriginalException?.Message ?? response.ToString()}"); + _collector.EmitGlobalError( + $"Failed to publish synonym set '{setName}'. Reason: {response.ApiCallDetails.OriginalException?.Message ?? 
response.ToString()}"); else _logger.LogInformation("Successfully published synonym set '{SetName}'.", setName); } @@ -295,7 +305,8 @@ private async Task PutQueryRuleset(QueryRuleset ruleset, string rulesetName, Can ctx); if (!response.ApiCallDetails.HasSuccessfulStatusCode) - _collector.EmitGlobalError($"Failed to publish query ruleset '{rulesetName}'. Reason: {response.ApiCallDetails.OriginalException?.Message ?? response.ToString()}"); + _collector.EmitGlobalError( + $"Failed to publish query ruleset '{rulesetName}'. Reason: {response.ApiCallDetails.OriginalException?.Message ?? response.ToString()}"); else _logger.LogInformation("Successfully published query ruleset '{RulesetName}'.", rulesetName); } From 7f578324ec6fa4b699a7faa1b409cb19c6b8e5a7 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 24 Feb 2026 18:06:32 +0100 Subject: [PATCH 11/15] Fix template parameter semantics: {type} = build type, {env} = environment Rename indexNamespace to buildType throughout the exporter pipeline so callers pass the build type (assembler, isolated, codex) instead of the environment name. Search services now hardcode "assembler" as the type since they always target assembler indices. ResolveNamespace renamed to ResolveEnvironment and updated to parse the old production index format ({variant}-docs-{env}-{timestamp}) to extract the environment name. 
--- .../Building/CodexBuildService.cs | 2 +- .../ElasticsearchEndpointFactory.cs | 30 +++++++++++-------- .../Search/DocumentationMappingConfig.cs | 2 +- .../ElasticsearchMarkdownExporter.cs | 22 +++++--------- .../Exporters/ExporterExtensions.cs | 4 +-- .../Elastic.Documentation.Api.App/Program.cs | 5 +++- .../Program.cs | 4 ++- .../Building/AssemblerBuildService.cs | 2 +- .../Building/AssemblerBuilder.cs | 4 +-- .../Common/ElasticsearchClientAccessor.cs | 10 +++++-- .../Search/SearchBootstrapFixture.cs | 10 +++---- 11 files changed, 50 insertions(+), 45 deletions(-) diff --git a/src/Elastic.Codex/Building/CodexBuildService.cs b/src/Elastic.Codex/Building/CodexBuildService.cs index 8db6350d2..a502ff52c 100644 --- a/src/Elastic.Codex/Building/CodexBuildService.cs +++ b/src/Elastic.Codex/Building/CodexBuildService.cs @@ -85,7 +85,7 @@ public async Task BuildAll( if (exporters is not null && buildContexts.Count > 0) { var firstContext = buildContexts[0].BuildContext; - sharedExporters = exporters.CreateMarkdownExporters(logFactory, firstContext, context.IndexNamespace).ToArray(); + sharedExporters = exporters.CreateMarkdownExporters(logFactory, firstContext, "codex").ToArray(); var startTasks = sharedExporters.Select(async e => await e.StartAsync(ctx)); await Task.WhenAll(startTasks); } diff --git a/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs b/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs index d5e6cd6ac..86def5bbc 100644 --- a/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs +++ b/src/Elastic.Documentation.ServiceDefaults/ElasticsearchEndpointFactory.cs @@ -56,39 +56,43 @@ public static DocumentationEndpoints Create(IConfiguration? 
appConfiguration = n Username = username }; - var ns = ResolveNamespace(config, appConfiguration, endpoint.IndexNamePrefix); + var ns = ResolveEnvironment(config, appConfiguration); return new DocumentationEndpoints { Elasticsearch = endpoint, Namespace = ns }; } /// - /// Resolves the deployment namespace using this priority: - /// 1. DOCUMENTATION_ELASTIC_INDEX env var — strip prefix and -latest suffix + /// Resolves the environment name using this priority: + /// 1. DOCUMENTATION_ELASTIC_INDEX env var — parse old format {variant}-docs-{env}-{timestamp} /// 2. DOTNET_ENVIRONMENT env var /// 3. ENVIRONMENT env var /// 4. Fallback: "dev" /// - private static string ResolveNamespace(IConfiguration config, IConfiguration? appConfiguration, string indexNamePrefix) + private static string ResolveEnvironment(IConfiguration config, IConfiguration? appConfiguration) { var indexName = appConfiguration?["DOCUMENTATION_ELASTIC_INDEX"] ?? config["DOCUMENTATION_ELASTIC_INDEX"]; if (!string.IsNullOrEmpty(indexName)) { - var prefix = $"{indexNamePrefix}-"; - const string suffix = "-latest"; - if (indexName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase) && - indexName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)) + // Old production format: {variant}-docs-{env}-{timestamp} + // e.g. "lexical-docs-edge-2025.10.23.120521" + // Extract the environment segment after "docs-" and before the next "-" followed by digits. + const string marker = "-docs-"; + var markerIndex = indexName.IndexOf(marker, StringComparison.OrdinalIgnoreCase); + if (markerIndex >= 0) { - var ns = indexName[prefix.Length..^suffix.Length]; - if (!string.IsNullOrEmpty(ns)) - return ns; + var afterMarker = indexName[(markerIndex + marker.Length)..]; + var dashIndex = afterMarker.IndexOf('-'); + var env = dashIndex > 0 ? 
afterMarker[..dashIndex] : afterMarker; + if (!string.IsNullOrEmpty(env) && (dashIndex < 0 || char.IsDigit(afterMarker[dashIndex + 1]))) + return env.ToLowerInvariant(); } } - var env = config["DOTNET_ENVIRONMENT"] + var envVar = config["DOTNET_ENVIRONMENT"] ?? config["ENVIRONMENT"]; - return !string.IsNullOrEmpty(env) ? env.ToLowerInvariant() : "dev"; + return !string.IsNullOrEmpty(envVar) ? envVar.ToLowerInvariant() : "dev"; } } diff --git a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index e07f0cbf0..f5101dec4 100644 --- a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -29,7 +29,7 @@ namespace Elastic.Documentation.Search; )] public static partial class DocumentationMappingContext; -public static class LexicalConfig +public static class LexicalConfig : IConfigureElasticsearch { public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 965ebd179..6a6a510dd 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -27,7 +27,7 @@ public partial class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposa private readonly ILogger _logger; private readonly ElasticsearchEndpoint _endpoint; private readonly DistributedTransport _transport; - private readonly string _indexNamespace; + private readonly string _buildType; // Ingest: orchestrator for dual-index mode private readonly IncrementalSyncOrchestrator _orchestrator; @@ -56,7 +56,7 @@ public ElasticsearchMarkdownExporter( ILoggerFactory logFactory, IDiagnosticsCollector collector, DocumentationEndpoints endpoints, - 
string indexNamespace, + string buildType, IDocumentationConfigurationContext context ) { @@ -64,7 +64,7 @@ IDocumentationConfigurationContext context _context = context; _logger = logFactory.CreateLogger(); _endpoint = endpoints.Elasticsearch; - _indexNamespace = indexNamespace; + _buildType = buildType; _versionsConfiguration = context.VersionsConfiguration; _synonyms = context.SearchConfiguration.Synonyms; _rules = context.SearchConfiguration.Rules; @@ -83,17 +83,13 @@ IDocumentationConfigurationContext context _fixedSynonymsHash = HashedBulkUpdate.CreateHash(string.Join(",", indexTimeSynonyms)); var aiPipeline = es.EnableAiEnrichment ? EnrichPolicyManager.PipelineName : null; - var synonymSetName = $"docs-{indexNamespace}"; - var ns = indexNamespace.ToLowerInvariant(); - var lexicalPrefix = es.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant(); - var lexicalAlias = $"{lexicalPrefix}-{ns}"; + var synonymSetName = $"docs-{buildType}"; var pipelineSettings = aiPipeline is not null ? 
new Dictionary { ["index.default_pipeline"] = aiPipeline } : null; - _lexicalTypeContext = DocumentationMappingContext.DocumentationDocument.Context - .WithIndexName(lexicalAlias) with + _lexicalTypeContext = DocumentationMappingContext.DocumentationDocument.CreateContext(type: buildType) with { ConfigureAnalysis = a => DocumentationAnalysisFactory.BuildAnalysis(a, synonymSetName, indexTimeSynonyms), IndexSettings = pipelineSettings @@ -107,9 +103,7 @@ IDocumentationConfigurationContext context _enrichPolicyManager = new EnrichPolicyManager(_transport, logFactory.CreateLogger(), _enrichmentCache.IndexName); } - var semanticAlias = $"{es.IndexNamePrefix.ToLowerInvariant()}-{ns}"; - _semanticTypeContext = DocumentationMappingContext.DocumentationDocumentSemantic.Context - .WithIndexName(semanticAlias) with + _semanticTypeContext = DocumentationMappingContext.DocumentationDocumentSemantic.CreateContext(type: buildType) with { ConfigureAnalysis = a => DocumentationAnalysisFactory.BuildAnalysis(a, synonymSetName, indexTimeSynonyms), IndexSettings = pipelineSettings @@ -237,7 +231,7 @@ private async ValueTask BackfillMissingAiFieldsAsync(string semanticAlias, Cance private async Task PublishSynonymsAsync(Cancel ctx) { - var setName = $"docs-{_indexNamespace}"; + var setName = $"docs-{_buildType}"; _logger.LogInformation("Publishing synonym set '{SetName}' to Elasticsearch", setName); var synonymRules = _synonyms.Aggregate(new List(), (acc, synonym) => @@ -275,7 +269,7 @@ private async Task PublishQueryRulesAsync(Cancel ctx) return; } - var rulesetName = $"docs-ruleset-{_indexNamespace}"; + var rulesetName = $"docs-ruleset-{_buildType}"; _logger.LogInformation("Publishing query ruleset '{RulesetName}' with {Count} rules to Elasticsearch", rulesetName, _rules.Count); var rulesetRules = _rules.Select(r => new QueryRulesetRule diff --git a/src/Elastic.Markdown/Exporters/ExporterExtensions.cs b/src/Elastic.Markdown/Exporters/ExporterExtensions.cs index cec7388f3..6deb2a8c0 
100644 --- a/src/Elastic.Markdown/Exporters/ExporterExtensions.cs +++ b/src/Elastic.Markdown/Exporters/ExporterExtensions.cs @@ -15,7 +15,7 @@ public static IReadOnlyCollection CreateMarkdownExporters( this IReadOnlySet exportOptions, ILoggerFactory logFactory, IDocumentationConfigurationContext context, - string indexNamespace + string buildType ) { var markdownExporters = new List(4); @@ -24,7 +24,7 @@ string indexNamespace if (exportOptions.Contains(Exporter.Configuration)) markdownExporters.Add(new ConfigurationExporter(logFactory, context.ConfigurationFileProvider, context)); if (exportOptions.Contains(Exporter.Elasticsearch)) - markdownExporters.Add(new ElasticsearchMarkdownExporter(logFactory, context.Collector, context.Endpoints, indexNamespace, context)); + markdownExporters.Add(new ElasticsearchMarkdownExporter(logFactory, context.Collector, context.Endpoints, buildType, context)); return markdownExporters; } } diff --git a/src/api/Elastic.Documentation.Api.App/Program.cs b/src/api/Elastic.Documentation.Api.App/Program.cs index 9ee9ef56d..ff255c280 100644 --- a/src/api/Elastic.Documentation.Api.App/Program.cs +++ b/src/api/Elastic.Documentation.Api.App/Program.cs @@ -6,6 +6,7 @@ using Elastic.Documentation.Api.Infrastructure.OpenTelemetry; using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Assembler; +using Elastic.Documentation.Search; using Elastic.Documentation.ServiceDefaults; using Microsoft.AspNetCore.Diagnostics; using Microsoft.AspNetCore.Diagnostics.HealthChecks; @@ -86,7 +87,9 @@ static void LogElasticsearchConfiguration(WebApplication app, ILogger logger) if (endpoints is not null) { var endpoint = endpoints.Elasticsearch; - var searchIndex = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{endpoints.Namespace}-latest"; + var searchIndex = DocumentationMappingContext.DocumentationDocumentSemantic + .CreateContext(type: "assembler") + .ResolveReadTarget(); logger.LogInformation( "Elasticsearch configuration - 
Url: {Url}, Namespace: {Namespace}, SearchIndex: {SearchIndex}", endpoint.Uri, diff --git a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs index 129821347..41615c7c1 100644 --- a/src/api/Elastic.Documentation.Mcp.Remote/Program.cs +++ b/src/api/Elastic.Documentation.Mcp.Remote/Program.cs @@ -146,7 +146,9 @@ static void LogElasticsearchConfiguration(WebApplication app, ILogger logger) if (endpoints is not null) { var endpoint = endpoints.Elasticsearch; - var searchIndex = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{endpoints.Namespace}-latest"; + var searchIndex = DocumentationMappingContext.DocumentationDocumentSemantic + .CreateContext(type: "assembler") + .ResolveReadTarget(); logger.LogInformation( "Elasticsearch configuration - Url: {Url}, Namespace: {Namespace}, SearchIndex: {SearchIndex}", endpoint.Uri, diff --git a/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuildService.cs b/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuildService.cs index bd0a6e760..941aa47c8 100644 --- a/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuildService.cs +++ b/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuildService.cs @@ -109,7 +109,7 @@ Cancel ctx var builder = new AssemblerBuilder(logFactory, assembleContext, navigation, htmlWriter, pathProvider, historyMapper); - await builder.BuildAllAsync(assembleContext.Environment, assembleSources.AssembleSets, exporters, ctx); + await builder.BuildAllAsync(assembleSources.AssembleSets, exporters, ctx); if (exporters.Contains(Exporter.LinkMetadata)) await cloner.WriteLinkRegistrySnapshot(checkoutResult.LinkRegistrySnapshot, ctx); diff --git a/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuilder.cs b/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuilder.cs index e60954294..74b037059 100644 --- a/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuilder.cs 
+++ b/src/services/Elastic.Documentation.Assembler/Building/AssemblerBuilder.cs @@ -38,7 +38,7 @@ public class AssemblerBuilder( private ILegacyUrlMapper? LegacyUrlMapper { get; } = legacyUrlMapper; - public async Task BuildAllAsync(PublishEnvironment environment, FrozenDictionary assembleSets, IReadOnlySet exportOptions, Cancel ctx) + public async Task BuildAllAsync(FrozenDictionary assembleSets, IReadOnlySet exportOptions, Cancel ctx) { if (context.OutputDirectory.Exists) context.OutputDirectory.Delete(true); @@ -48,7 +48,7 @@ public async Task BuildAllAsync(PublishEnvironment environment, FrozenDictionary var buildTimes = new List<(string Name, int FileCount, TimeSpan Duration)>(); // Create exporters without inferrer - inferrer is created per-repository - var markdownExporters = exportOptions.CreateMarkdownExporters(logFactory, context, environment.Name); + var markdownExporters = exportOptions.CreateMarkdownExporters(logFactory, context, "assembler"); var tasks = markdownExporters.Select(async e => await e.StartAsync(ctx)); await Task.WhenAll(tasks); diff --git a/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs b/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs index 9ee24d757..7c26b7843 100644 --- a/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs +++ b/src/services/Elastic.Documentation.Search/Common/ElasticsearchClientAccessor.cs @@ -6,6 +6,7 @@ using Elastic.Clients.Elasticsearch.Serialization; using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Search; +using Elastic.Documentation.Search; using Elastic.Transport; namespace Elastic.Documentation.Search.Common; @@ -28,7 +29,8 @@ public class ElasticsearchClientAccessor : IDisposable public ElasticsearchClientAccessor( DocumentationEndpoints endpoints, - SearchConfiguration searchConfiguration) + SearchConfiguration searchConfiguration + ) { var endpoint = endpoints.Elasticsearch; 
Endpoint = endpoint; @@ -36,10 +38,12 @@ public ElasticsearchClientAccessor( SynonymBiDirectional = searchConfiguration.SynonymBiDirectional; DiminishTerms = searchConfiguration.DiminishTerms; - SearchIndex = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{endpoints.Namespace}-latest"; + SearchIndex = DocumentationMappingContext.DocumentationDocumentSemantic + .CreateContext(type: "assembler") + .ResolveReadTarget(); RulesetName = searchConfiguration.Rules.Count > 0 - ? $"docs-ruleset-{endpoints.Namespace}" + ? "docs-ruleset-assembler" : null; _nodePool = new SingleNodePool(endpoint.Uri); diff --git a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs index c7f0db5a8..cb9411ec5 100644 --- a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs +++ b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs @@ -152,12 +152,10 @@ private async ValueTask IsIndexingNeeded() var collector = new ConsoleDiagnosticsCollector(loggerFactory); // Create semantic type context to check channel hash (index namespace is 'dev' for tests) - var semanticTypeContext = DocumentationAnalysisFactory.CreateContext( - DocumentationMappingContext.DocumentationDocumentSemantic.Context, - $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-dev", - "docs-dev", - [] - ); + var semanticTypeContext = DocumentationMappingContext.DocumentationDocumentSemantic.CreateContext(type: "assembler") with + { + ConfigureAnalysis = a => DocumentationAnalysisFactory.BuildAnalysis(a, "docs-assembler", []) + }; var options = new IngestChannelOptions(transport, semanticTypeContext); using var channel = new IngestChannel(options); From b190fd84941133f2293655bf4892f6cf186c8420 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 24 Feb 2026 18:14:58 +0100 Subject: [PATCH 12/15] Remove `IndexNamePrefix` across configuration, 
services, and commands to simplify index naming logic. Update Elasticsearch dependencies to version 0.28.0. --- Directory.Packages.props | 4 +- PLAN-rules-config.md | 194 ------------------ .../DocumentationEndpoints.cs | 3 - .../ElasticsearchEndpointConfigurator.cs | 3 - .../Search/DocumentationMappingConfig.cs | 36 ++-- .../Indexing/AssemblerIndexService.cs | 3 - .../IsolatedIndexService.cs | 3 - .../Assembler/AssemblerIndexCommand.cs | 6 +- .../Commands/Codex/CodexIndexCommand.cs | 3 - .../docs-builder/Commands/IndexCommand.cs | 6 +- 10 files changed, 18 insertions(+), 243 deletions(-) delete mode 100644 PLAN-rules-config.md diff --git a/Directory.Packages.props b/Directory.Packages.props index 20f8117f1..5ca9831a0 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + diff --git a/PLAN-rules-config.md b/PLAN-rules-config.md deleted file mode 100644 index 94fc33bbd..000000000 --- a/PLAN-rules-config.md +++ /dev/null @@ -1,194 +0,0 @@ -# Improved Rules Configuration Format - -## Context - -The `block` section in `changelog.yml` is being redesigned and renamed to `rules:`. Goals: -1. Explicit matching semantics (`any` vs `all`) -2. Per-field include/exclude modes for types and areas -3. Product overrides nested under the section they affect -4. Clear, scannable log messages prefixed with `[+include]` / `[-exclude]` -5. No backward compat — error if old `block:` key is seen - -## YAML Format - -```yaml -rules: - # Global match default for multi-valued fields (labels, areas). - # any (default) = match if ANY item matches the list - # all = match only if ALL items match the list - # Inherited by create, publish, and all product overrides. - # match: any - - # Create — controls which PRs generate changelog entries. - # exclude: block PRs with these labels (comma-separated) - # include: only create changelogs for PRs with these labels - # Cannot specify both. 
- # - # create: - # exclude: ">non-issue, >test" - # # match: any - # products: - # 'elasticsearch, kibana': - # exclude: ">test" - # 'cloud-serverless': - # exclude: "ILM" - - # Publish — controls which entries appear in rendered output. - # exclude_types / include_types - # exclude_areas / include_areas - # Cannot mix exclude_ and include_ for the same field. - # - # match_areas inherits from rules.match if not specified. - # - # publish: - # # match_areas: any - # exclude_types: - # - deprecation - # - known-issue - # exclude_areas: - # - "Internal" - # products: - # 'elasticsearch, kibana': - # exclude_types: - # - docs - # 'cloud-serverless': - # # match_areas: any - # include_areas: - # - "Search" - # - "Monitoring" -``` - -### Match inheritance - -``` -rules.match (global default, "any" if omitted) - ├─ create.match → create.products.{id}.match - └─ publish.match_areas → publish.products.{id}.match_areas -``` - -### Area matching examples - -| Config | Entry areas: `["Search", "Internal"]` | Result | -|--------|--------------------------------------|--------| -| `exclude_areas: [Internal]`, match `any` | "Internal" matches | **Blocked** | -| `exclude_areas: [Internal]`, match `all` | Not all match | **Allowed** | -| `include_areas: [Search]`, match `any` | "Search" matches | **Allowed** | -| `include_areas: [Search]`, match `all` | "Internal" not in list | **Blocked** | - -## Error Messages - -### Validation (config parsing) - -| Condition | Message | -|-----------|---------| -| Old `block:` key found | `'block' is no longer supported. Rename to 'rules'. See changelog.example.yml.` | -| Both `exclude_types` + `include_types` | `rules.publish: cannot have both 'exclude_types' and 'include_types'. Use one or the other.` | -| Both `exclude_areas` + `include_areas` | Same pattern | -| Both `create.exclude` + `create.include` | `rules.create: cannot have both 'exclude' and 'include'. 
Use one or the other.` | -| Invalid match value | `rules.match: '{value}' is not valid. Use 'any' or 'all'.` | -| Empty list | `rules.publish.exclude_types: list is empty. Add types or remove the field.` | -| Unknown product | `rules.publish.products: '{id}' not in available products. Available: {list}` | - -### Runtime (create/publish time) - -Prefixed with `[-exclude]` or `[+include]` for scanning: - -**Create:** -- `[-exclude] PR #{n}: skipped, label '{label}' matches rules.create.exclude (match: {mode})` -- `[+include] PR #{n}: created, label '{label}' matches rules.create.include (match: {mode})` -- `[+include] PR #{n}: skipped, no labels match rules.create.include [{labels}] (match: {mode})` -- Product: `[-exclude] PR #{n} ({product}): skipped, label '{label}' matches rules.create.products.{product}.exclude` - -**Publish:** -- `[-exclude] PR #{n}: hidden, type '{type}' in rules.publish.exclude_types` -- `[+include] PR #{n}: hidden, type '{type}' not in rules.publish.include_types` -- `[-exclude] PR #{n}: hidden, area '{area}' in rules.publish.exclude_areas (match_areas: {mode})` -- `[-exclude] PR #{n}: hidden, all areas [{areas}] in rules.publish.exclude_areas (match_areas: all)` -- `[+include] PR #{n}: hidden, areas [{areas}] not in rules.publish.include_areas (match_areas: {mode})` -- Product: same patterns with `rules.publish.products.{product}.` prefix - -## Files to Modify - -### 1. Domain model — enums and PublishBlocker -**`src/Elastic.Documentation/ReleaseNotes/PublishBlocker.cs`** - -- Add `MatchMode` enum (`Any`, `All`) -- Add `FieldMode` enum (`Exclude`, `Include`) -- Add to `PublishBlocker`: `MatchAreas` (MatchMode), `TypesMode` (FieldMode), `AreasMode` (FieldMode) - -### 2. Domain model — rename and restructure BlockConfiguration -**`src/Elastic.Documentation.Configuration/Changelog/BlockConfiguration.cs`** - -Rename to `RulesConfiguration` (or new file). 
Structure: -- `RulesConfiguration`: `Match` (MatchMode), `Create` (CreateRules?), `Publish` (PublishRules?) -- `CreateRules`: `Labels` (list), `Mode` (FieldMode), `Match` (MatchMode?), `ByProduct` (dict) -- `PublishRules`: `PublishBlocker` fields + `ByProduct` (dict of product-specific `PublishBlocker`s) -- Delete old `ProductBlockers` record - -### 3. Core blocking logic -**`src/Elastic.Documentation/ReleaseNotes/PublishBlockerExtensions.cs`** - -- `MatchesType()`: type vs list -- `MatchesArea()`: any/all matching -- `ShouldBlock()`: per-field mode (`Exclude` + match → blocked; `Include` + no match → blocked) - -### 4. YAML DTO (CLI path) -**`src/services/Elastic.Changelog/Serialization/ChangelogConfigurationYaml.cs`** - -- Rename `BlockConfigurationYaml` → `RulesConfigurationYaml` -- New `CreateRulesYaml`: `Exclude`/`Include` (string), `Match` (string?), `Products` (dict) -- Update `PublishBlockerYaml`: `MatchAreas`, `ExcludeTypes`/`IncludeTypes`, `ExcludeAreas`/`IncludeAreas`, `Products` (dict) -- Remove old fields (`Types`, `Areas`, `Create` string, root `Product`) -- Update parent `ChangelogConfigurationYaml`: rename `Block` → `Rules` - -### 5. YAML DTO (minimal/inline path) -**`src/Elastic.Documentation.Configuration/ReleaseNotes/ReleaseNotesSerialization.cs`** - -Mirror changes for minimal DTOs. Rename `BlockConfigMinimalDto` → `RulesConfigMinimalDto`, etc. - -### 6. Configuration parsing + validation -**`src/services/Elastic.Changelog/Configuration/ChangelogConfigurationLoader.cs`** - -- Detect old `block:` key → emit error -- Parse `rules:` with new structure -- Validate mutual exclusivity, match values, empty lists -- Resolve match inheritance chain - -### 7. Create blocking logic -Find where create labels are checked and update for include/exclude + match + runtime messages. - -### 8. 
Rendering utilities -**`src/services/Elastic.Changelog/Rendering/ChangelogRenderUtilities.cs`** - -- Update for new `publish.products` structure -- Add `[-exclude]` / `[+include]` prefixed runtime log messages - -### 9. Example config -**`config/changelog.example.yml`** — replace `block:` section with `rules:`. - -### 10. All references to BlockConfiguration -Find and update all code referencing `BlockConfiguration`, `Block`, `ProductBlockers` to use new names. - -### 11. Tests - -**Unit tests** (`PublishBlockerExtensionsTests.cs`): -- All mode/match combinations (exclude×any, exclude×all, include×any, include×all) -- Mixed modes (exclude_types + include_areas) -- Match inheritance (global → section → product) - -**Integration tests** (`BlockConfigurationTests.cs`): -- New format end-to-end -- Validation error messages (mutual exclusivity, invalid match, old `block:` key) -- Product overrides under publish.products and create.products -- Create include/exclude + match -- Runtime message prefixes `[-exclude]` / `[+include]` - -## Verification - -1. New unit tests for all mode/match combinations -2. Integration tests with new config format -3. Validation error tests — verify all error messages -4. Old `block:` key → error test -5. YAML parsing on both CLI and minimal paths -6. Runtime messages at create and publish time with correct prefixes -7. 
Match inheritance chain works correctly diff --git a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs index 3636c0478..0e6ee09ce 100644 --- a/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs +++ b/src/Elastic.Documentation.Configuration/DocumentationEndpoints.cs @@ -26,9 +26,6 @@ public class ElasticsearchEndpoint public int IndexNumThreads { get; set; } = 4; // Reduced for Serverless rate limits public bool NoElasticInferenceService { get; set; } - // index options - public string IndexNamePrefix { get; set; } = "semantic-docs"; - // channel buffer options public int BufferSize { get; set; } = 50; // Reduced for Serverless rate limits public int MaxRetries { get; set; } = 5; // Increased for 429 retries diff --git a/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs b/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs index 035fe559c..4bd1586c1 100644 --- a/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs +++ b/src/Elastic.Documentation.Configuration/ElasticsearchEndpointConfigurator.cs @@ -28,7 +28,6 @@ public record ElasticsearchIndexOptions public int? BootstrapTimeout { get; init; } // index options - public string? IndexNamePrefix { get; init; } public bool? 
ForceReindex { get; init; } // channel buffer options @@ -84,8 +83,6 @@ public static async Task ApplyAsync( cfg.IndexNumThreads = options.IndexNumThreads.Value; if (options.NoEis.HasValue) cfg.NoElasticInferenceService = options.NoEis.Value; - if (!string.IsNullOrEmpty(options.IndexNamePrefix)) - cfg.IndexNamePrefix = options.IndexNamePrefix; if (options.BufferSize.HasValue) cfg.BufferSize = options.BufferSize.Value; if (options.MaxRetries.HasValue) diff --git a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index f5101dec4..0e1acc17f 100644 --- a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -4,43 +4,34 @@ using Elastic.Mapping; using Elastic.Mapping.Analysis; +using Elastic.Mapping.Mappings; namespace Elastic.Documentation.Search; [ElasticsearchMappingContext] -[Entity( - Target = EntityTarget.Index, - Name = "docs-lexical", - WriteAlias = "docs-lexical", - ReadAlias = "docs-lexical", - SearchPattern = "docs-lexical-*", +[Index( + NameTemplate = "docs-{type}.lexical-{env}", DatePattern = "yyyy.MM.dd.HHmmss", Configuration = typeof(LexicalConfig) )] -[Entity( - Target = EntityTarget.Index, - Name = "docs-semantic", +[Index( + NameTemplate = "docs-{type}.semantic-{env}", Variant = "Semantic", - WriteAlias = "docs-semantic", - ReadAlias = "docs-semantic", - SearchPattern = "docs-semantic-*", DatePattern = "yyyy.MM.dd.HHmmss", Configuration = typeof(SemanticConfig) )] public static partial class DocumentationMappingContext; -public static class LexicalConfig : IConfigureElasticsearch +public class LexicalConfig : IConfigureElasticsearch { - public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; - - public static DocumentationDocumentMappingsBuilder ConfigureMappings(DocumentationDocumentMappingsBuilder m) => - ConfigureCommonMappings(m) + public MappingsBuilder 
ConfigureMappings(MappingsBuilder mappings) => + ConfigureCommonMappings(mappings) .StrippedBody(f => f .Analyzer("synonyms_fixed_analyzer") .SearchAnalyzer("synonyms_analyzer") ); - internal static DocumentationDocumentMappingsBuilder ConfigureCommonMappings(DocumentationDocumentMappingsBuilder m) => m + internal static MappingsBuilder ConfigureCommonMappings(MappingsBuilder m) => m // Text fields with custom analyzers and multi-fields .SearchTitle(f => f .Analyzer("synonyms_fixed_analyzer") @@ -87,15 +78,13 @@ internal static DocumentationDocumentMappingsBuilder ConfigureCommonMappings(Doc .MultiField("keyword", mf => mf.Keyword())); } -public static class SemanticConfig +public class SemanticConfig : IConfigureElasticsearch { private const string ElserInferenceId = ".elser-2-elastic"; private const string JinaInferenceId = ".jina-embeddings-v5-text-small"; - public static AnalysisBuilder ConfigureAnalysis(AnalysisBuilder analysis) => analysis; - - public static DocumentationDocumentMappingsBuilder ConfigureMappings(DocumentationDocumentMappingsBuilder m) => - LexicalConfig.ConfigureCommonMappings(m) + public MappingsBuilder ConfigureMappings(MappingsBuilder mappings) => + LexicalConfig.ConfigureCommonMappings(mappings) .StrippedBody(s => s .Analyzer("synonyms_fixed_analyzer") .SearchAnalyzer("synonyms_analyzer") @@ -158,5 +147,4 @@ public static AnalysisBuilder BuildAnalysis(AnalysisBuilder analysis, string syn .TokenizeOnChars("whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}")) .Tokenizer("path_tokenizer", t => t.PathHierarchy() .Delimiter('/')); - } diff --git a/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs b/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs index cb84652c2..323129a40 100644 --- a/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs +++ b/src/services/Elastic.Documentation.Assembler/Indexing/AssemblerIndexService.cs @@ 
-37,7 +37,6 @@ ICoreService githubActionsService /// The number of index threads the inference endpoint should use. Defaults: 8 /// Do not use the Elastic Inference Service, bootstrap inference endpoint /// Timeout in minutes for the inference endpoint creation. Defaults: 4 - /// The prefix for the computed index/alias names. Defaults: semantic-docs /// Force reindex strategy to semantic index /// The number of documents to send to ES as part of the bulk. Defaults: 100 /// The number of times failed bulk items should be retried. Defaults: 3 @@ -65,7 +64,6 @@ public async Task Index(IDiagnosticsCollector collector, bool? noEis = null, int? bootstrapTimeout = null, // index options - string? indexNamePrefix = null, bool? forceReindex = null, // channel buffer options int? bufferSize = null, @@ -94,7 +92,6 @@ public async Task Index(IDiagnosticsCollector collector, IndexNumThreads = indexNumThreads, NoEis = noEis, BootstrapTimeout = bootstrapTimeout, - IndexNamePrefix = indexNamePrefix, ForceReindex = forceReindex, BufferSize = bufferSize, MaxRetries = maxRetries, diff --git a/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs b/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs index 9d996c8ed..19e060b1b 100644 --- a/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs +++ b/src/services/Elastic.Documentation.Isolated/IsolatedIndexService.cs @@ -34,7 +34,6 @@ ICoreService githubActionsService /// The number of index threads the inference endpoint should use. Defaults: 8 /// Do not use the Elastic Inference Service, bootstrap inference endpoint /// Timeout in minutes for the inference endpoint creation. Defaults: 4 - /// The prefix for the computed index/alias names. Defaults: semantic-docs /// Force reindex strategy to semantic index /// The number of documents to send to ES as part of the bulk. Defaults: 100 /// The number of times failed bulk items should be retried. 
Defaults: 3 @@ -62,7 +61,6 @@ public async Task Index(IDiagnosticsCollector collector, bool? noEis = null, int? bootstrapTimeout = null, // index options - string? indexNamePrefix = null, bool? forceReindex = null, // channel buffer options int? bufferSize = null, @@ -91,7 +89,6 @@ public async Task Index(IDiagnosticsCollector collector, IndexNumThreads = indexNumThreads, NoEis = noEis, BootstrapTimeout = bootstrapTimeout, - IndexNamePrefix = indexNamePrefix, ForceReindex = forceReindex, BufferSize = bufferSize, MaxRetries = maxRetries, diff --git a/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs b/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs index ba3e8ebc9..df29d5666 100644 --- a/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs +++ b/src/tooling/docs-builder/Commands/Assembler/AssemblerIndexCommand.cs @@ -34,7 +34,6 @@ ICoreService githubActionsService /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 /// Do not use the Elastic Inference Service, bootstrap inference endpoint - /// The prefix for the computed index/alias names. Defaults: semantic-docs /// Force reindex strategy to semantic index /// Timeout in minutes for the inference endpoint creation. Defaults: 4 /// The number of documents to send to ES as part of the bulk. Defaults: 100 @@ -65,7 +64,6 @@ public async Task Index( int? bootstrapTimeout = null, // index options - string? indexNamePrefix = null, bool? 
forceReindex = null, // channel buffer options @@ -97,7 +95,7 @@ public async Task Index( // inference options enableAiEnrichment, indexNumThreads, searchNumThreads, noEis, bootstrapTimeout, // channel and connection options - indexNamePrefix, forceReindex, bufferSize, maxRetries, debugMode, + forceReindex, bufferSize, maxRetries, debugMode, // proxy options proxyAddress, proxyPassword, proxyUsername, // certificate options @@ -110,7 +108,7 @@ static async (s, collector, state, ctx) => await s.Index(collector, state.fs, // inference options state.enableAiEnrichment, state.searchNumThreads, state.indexNumThreads, state.noEis, state.bootstrapTimeout, // channel and connection options - state.indexNamePrefix, state.forceReindex, state.bufferSize, state.maxRetries, state.debugMode, + state.forceReindex, state.bufferSize, state.maxRetries, state.debugMode, // proxy options state.proxyAddress, state.proxyPassword, state.proxyUsername, // certificate options diff --git a/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs b/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs index cefabf0d5..ae11fbb96 100644 --- a/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs +++ b/src/tooling/docs-builder/Commands/Codex/CodexIndexCommand.cs @@ -40,7 +40,6 @@ ICoreService githubActionsService /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 /// Do not use the Elastic Inference Service, bootstrap inference endpoint - /// The prefix for the computed index/alias names. Defaults: semantic-docs /// Force reindex strategy to semantic index /// Timeout in minutes for the inference endpoint creation. Defaults: 4 /// The number of documents to send to ES as part of the bulk. Defaults: 100 @@ -71,7 +70,6 @@ public async Task Index( int? bootstrapTimeout = null, // index options - string? indexNamePrefix = null, bool? 
forceReindex = null, // channel buffer options @@ -137,7 +135,6 @@ public async Task Index( IndexNumThreads = indexNumThreads, NoEis = noEis, BootstrapTimeout = bootstrapTimeout, - IndexNamePrefix = indexNamePrefix, ForceReindex = forceReindex, BufferSize = bufferSize, MaxRetries = maxRetries, diff --git a/src/tooling/docs-builder/Commands/IndexCommand.cs b/src/tooling/docs-builder/Commands/IndexCommand.cs index ad34f4eaf..ff402ce16 100644 --- a/src/tooling/docs-builder/Commands/IndexCommand.cs +++ b/src/tooling/docs-builder/Commands/IndexCommand.cs @@ -31,7 +31,6 @@ ICoreService githubActionsService /// Enable AI enrichment of documents using LLM-generated metadata /// The number of search threads the inference endpoint should use. Defaults: 8 /// The number of index threads the inference endpoint should use. Defaults: 8 - /// The prefix for the computed index/alias names. Defaults: semantic-docs /// Do not use the Elastic Inference Service, bootstrap inference endpoint /// Force reindex strategy to semantic index /// Timeout in minutes for the inference endpoint creation. Defaults: 4 @@ -63,7 +62,6 @@ public async Task Index( int? bootstrapTimeout = null, // index options - string? indexNamePrefix = null, bool? 
forceReindex = null, // channel buffer options @@ -95,7 +93,7 @@ public async Task Index( // inference options enableAiEnrichment, indexNumThreads, noEis, searchNumThreads, bootstrapTimeout, // channel and connection options - indexNamePrefix, forceReindex, bufferSize, maxRetries, debugMode, + forceReindex, bufferSize, maxRetries, debugMode, // proxy options proxyAddress, proxyPassword, proxyUsername, // certificate options @@ -108,7 +106,7 @@ static async (s, collector, state, ctx) => await s.Index(collector, state.fs, st // inference options state.enableAiEnrichment, state.searchNumThreads, state.indexNumThreads, state.noEis, state.bootstrapTimeout, // channel and connection options - state.indexNamePrefix, state.forceReindex, state.bufferSize, state.maxRetries, state.debugMode, + state.forceReindex, state.bufferSize, state.maxRetries, state.debugMode, // proxy options state.proxyAddress, state.proxyPassword, state.proxyUsername, // certificate options From 40b3241a97f1fc133445845df94eb129e18ec234 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 24 Feb 2026 19:18:40 +0100 Subject: [PATCH 13/15] Bump ingest libraries --- Directory.Packages.props | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 5ca9831a0..39eff5fef 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + From a19a4cff59b58fa58e00b84cae296d583169be92 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 24 Feb 2026 19:20:57 +0100 Subject: [PATCH 14/15] Bump ingest libraries --- Directory.Packages.props | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 39eff5fef..87f4bd481 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + From b07ce2a6a9e803e43d1f9a8242d463a048bb6155 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Thu, 26 Feb 2026 13:47:25 +0100 
Subject: [PATCH 15/15] Replace handrolled AI enrichment with Elastic.Ingest 0.30.0 AiEnrichmentOrchestrator Upgrade Elastic.Ingest.Elasticsearch and Elastic.Mapping to 0.30.0 which includes source-generated AI enrichment support (elastic/elastic-ingest-dotnet#151). - Annotate DocumentationDocument with [AiInput]/[AiField] attributes - Add [AiEnrichment] to DocumentationMappingContext - Replace ElasticsearchEnrichmentCache + ElasticsearchLlmClient + EnrichPolicyManager with a single AiEnrichmentOrchestrator that runs post-indexing - Remove 7 handrolled enrichment files (~1650 lines) and associated tests Made-with: Cursor --- Directory.Packages.props | 4 +- .../Search/DocumentationDocument.cs | 44 +- .../Search/DocumentationMappingConfig.cs | 4 + .../ElasticsearchMarkdownExporter.Export.cs | 94 ---- .../ElasticsearchMarkdownExporter.cs | 98 +--- .../ElasticsearchEnrichmentCache.cs | 374 -------------- .../Enrichment/ElasticsearchLlmClient.cs | 455 ------------------ .../Enrichment/EnrichPolicyManager.cs | 177 ------- .../Enrichment/EnrichmentKeyGenerator.cs | 31 -- .../Enrichment/EnrichmentOptions.cs | 32 -- .../Enrichment/IEnrichmentCache.cs | 42 -- .../Elasticsearch/Enrichment/ILlmClient.cs | 57 --- .../Enrichment/ElasticsearchLlmClientTests.cs | 304 ------------ 13 files changed, 35 insertions(+), 1681 deletions(-) delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchEnrichmentCache.cs delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchLlmClient.cs delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichPolicyManager.cs delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentKeyGenerator.cs delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentOptions.cs delete mode 100644 src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/IEnrichmentCache.cs delete mode 100644 
src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ILlmClient.cs delete mode 100644 tests/Elastic.Markdown.Tests/Enrichment/ElasticsearchLlmClientTests.cs diff --git a/Directory.Packages.props b/Directory.Packages.props index 87f4bd481..fe5e20dda 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -48,8 +48,8 @@ - - + + diff --git a/src/Elastic.Documentation/Search/DocumentationDocument.cs b/src/Elastic.Documentation/Search/DocumentationDocument.cs index bfbaace5d..a949d9dae 100644 --- a/src/Elastic.Documentation/Search/DocumentationDocument.cs +++ b/src/Elastic.Documentation/Search/DocumentationDocument.cs @@ -20,6 +20,7 @@ public record ParentDocument public record DocumentationDocument { + [AiInput] [JsonPropertyName("title")] public required string Title { get; set; } @@ -101,6 +102,7 @@ public record DocumentationDocument public string? Body { get; set; } /// Stripped body is the body with Markdown removed, suitable for search indexing + [AiInput] [JsonPropertyName("stripped_body")] public string? StrippedBody { get; set; } @@ -115,63 +117,37 @@ public record DocumentationDocument [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)] public bool Hidden { get; set; } - // AI Enrichment fields - populated by DocumentEnrichmentService + // AI Enrichment fields - populated post-indexing by AiEnrichmentOrchestrator - /// - /// Key for enrichment cache lookups. Derived from normalized content + prompt hash. - /// Used by enrich processor to join AI-generated fields at index time. - /// - [Keyword] - [JsonPropertyName("enrichment_key")] - [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] - public string? EnrichmentKey { get; set; } - - /// - /// 3-5 sentences dense with technical entities, API names, and core functionality for vector matching. - /// + [AiField("3-5 sentences densely packed with technical entities for semantic vector matching. 
Include: API endpoint names, method names, parameter names, configuration options, data types, and core functionality. Write for RAG retrieval - someone asking 'how do I configure X' should match this text.")] [Text] [JsonPropertyName("ai_rag_optimized_summary")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? AiRagOptimizedSummary { get; set; } - /// - /// Exactly 5-10 words for a UI tooltip. - /// + [AiField("Exactly 5-10 words for UI tooltip or search snippet. Action-oriented, starts with a verb. Example: 'Configure index lifecycle policies for data retention'")] [Text] [JsonPropertyName("ai_short_summary")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? AiShortSummary { get; set; } - /// - /// A 3-8 word keyword string representing a high-intent user search for this doc. - /// + [AiField("3-8 keywords representing a realistic search query a developer would type. Include product name and key technical terms. Example: 'elasticsearch bulk api batch indexing'")] [Keyword] [JsonPropertyName("ai_search_query")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string? AiSearchQuery { get; set; } - /// - /// Array of 3-5 specific questions answered by this document. - /// + [AiField("Natural questions a dev would ask (6-15 words). Not too short, not too verbose. Examples: 'How do I bulk index documents?', 'What format does the bulk API use?', 'Why is my bulk request failing?'", + MinItems = 3, MaxItems = 5)] [Text] [JsonPropertyName("ai_questions")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string[]? AiQuestions { get; set; } - /// - /// Array of 2-4 specific use cases this doc helps with. - /// + [AiField("Simple 2-4 word tasks a dev wants to do. 
Examples: 'index documents', 'check cluster health', 'enable TLS', 'fix slow queries', 'backup data'", + MinItems = 2, MaxItems = 4)] [Text] [JsonPropertyName("ai_use_cases")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] public string[]? AiUseCases { get; set; } - - /// - /// Hash of the LLM prompt templates used to generate AI fields. - /// Used to detect stale enrichments when prompts change. - /// - [Keyword] - [JsonPropertyName("enrichment_prompt_hash")] - [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] - public string? EnrichmentPromptHash { get; set; } } diff --git a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs index 0e1acc17f..999f61bd6 100644 --- a/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs +++ b/src/Elastic.Documentation/Search/DocumentationMappingConfig.cs @@ -20,6 +20,10 @@ namespace Elastic.Documentation.Search; DatePattern = "yyyy.MM.dd.HHmmss", Configuration = typeof(SemanticConfig) )] +[AiEnrichment( + Role = "Expert technical writer creating search metadata for Elastic documentation (Elasticsearch, Kibana, Beats, Logstash). 
Audience: developers, DevOps, data engineers.", + MatchField = "url" +)] public static partial class DocumentationMappingContext; public class LexicalConfig : IConfigureElasticsearch diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs index e69dfc9e5..548a909a6 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.Export.cs @@ -10,7 +10,6 @@ using Elastic.Documentation.Navigation; using Elastic.Documentation.Search; using Elastic.Ingest.Elasticsearch.Indices; -using Elastic.Markdown.Exporters.Elasticsearch.Enrichment; using Markdig.Syntax; using Microsoft.Extensions.Logging; using static System.StringSplitOptions; @@ -154,13 +153,6 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, : null; CommonEnrichments(doc, currentNavigation); - - // AI Enrichment - hybrid approach: - // - Cache hits: enrich processor applies fields at index time - // - Cache misses: apply fields inline before indexing - doc.EnrichmentKey = EnrichmentKeyGenerator.Generate(doc.Title, doc.StrippedBody ?? string.Empty); - await TryEnrichDocumentAsync(doc, ctx); - AssignDocumentMetadata(doc); return await WriteDocumentAsync(doc, ctx); @@ -198,11 +190,6 @@ public async ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Canc doc.Abstract = @abstract; doc.Headings = headings; CommonEnrichments(doc, null); - - // AI Enrichment - hybrid approach - doc.EnrichmentKey = EnrichmentKeyGenerator.Generate(doc.Title, doc.StrippedBody ?? 
string.Empty); - await TryEnrichDocumentAsync(doc, ctx); - AssignDocumentMetadata(doc); if (!await WriteDocumentAsync(doc, ctx)) @@ -216,85 +203,4 @@ public async ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Canc return true; } - /// - /// Hybrid AI enrichment with staleness detection: - /// - Fresh cache hit: enrich processor will apply fields at index time - /// - Stale cache hit: apply stale data inline (better than nothing), queue for re-enrichment - /// - Cache miss: generate new enrichment if under limit, otherwise no AI fields - /// - private async ValueTask TryEnrichDocumentAsync(DocumentationDocument doc, Cancel ctx) - { - if (_enrichmentCache is null || _llmClient is null || string.IsNullOrWhiteSpace(doc.EnrichmentKey)) - return; - - // Check cache status by URL (handles content changes gracefully) - var status = _enrichmentCache.GetStatus(doc.Url, doc.EnrichmentKey); - - if (status.Exists && !status.IsStale) - { - // Fresh cache hit - enrich processor will apply fields at index time - _ = Interlocked.Increment(ref _cacheHitCount); - return; - } - - if (status.Exists && status.Entry is not null) - { - // Stale cache hit - apply stale data inline (better than nothing) - ApplyEnrichmentFromCache(doc, status.Entry); - _logger.LogDebug("Applied stale enrichment for {Url} (content changed)", doc.Url); - } - - // Try to generate fresh enrichment if under limit - var current = Interlocked.Increment(ref _enrichmentCount); - if (current > _enrichmentOptions.MaxNewEnrichmentsPerRun) - { - _ = Interlocked.Decrement(ref _enrichmentCount); - // Already applied stale data if available, otherwise no AI fields - return; - } - - // Generate fresh enrichment - try - { - var body = doc.StrippedBody ?? 
string.Empty; - var enrichment = await _llmClient.EnrichAsync(doc.Title, body, ctx); - if (enrichment is not { HasData: true }) - { - _logger.LogWarning( - "Enrichment returned no data for {Url} (title: {Title}, body: {BodyLength} chars)", - doc.Url, doc.Title, body.Length); - return; - } - - // Store in cache for future runs - await _enrichmentCache.StoreAsync(doc.EnrichmentKey, doc.Url, enrichment, ctx); - - // Apply fresh fields directly - doc.AiRagOptimizedSummary = enrichment.RagOptimizedSummary; - doc.AiShortSummary = enrichment.ShortSummary; - doc.AiSearchQuery = enrichment.SearchQuery; - doc.AiQuestions = enrichment.Questions; - doc.AiUseCases = enrichment.UseCases; - doc.EnrichmentPromptHash = ElasticsearchLlmClient.PromptHash; - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - _logger.LogWarning(ex, "Failed to enrich document {Url}", doc.Url); - _ = Interlocked.Decrement(ref _enrichmentCount); - } - } - - /// - /// Applies AI enrichment fields from a cache entry to the document. 
- /// - private static void ApplyEnrichmentFromCache(DocumentationDocument doc, CacheIndexEntry entry) - { - doc.AiRagOptimizedSummary = entry.AiRagOptimizedSummary; - doc.AiShortSummary = entry.AiShortSummary; - doc.AiSearchQuery = entry.AiSearchQuery; - doc.AiQuestions = entry.AiQuestions; - doc.AiUseCases = entry.AiUseCases; - doc.EnrichmentPromptHash = entry.PromptHash; - } - } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 6a6a510dd..7f0029f65 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -12,9 +12,9 @@ using Elastic.Documentation.Search; using Elastic.Documentation.Serialization; using Elastic.Ingest.Elasticsearch; +using Elastic.Ingest.Elasticsearch.Enrichment; using Elastic.Ingest.Elasticsearch.Indices; using Elastic.Mapping; -using Elastic.Markdown.Exporters.Elasticsearch.Enrichment; using Elastic.Transport; using Microsoft.Extensions.Logging; @@ -41,13 +41,8 @@ public partial class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposa private readonly VersionsConfiguration _versionsConfiguration; private readonly string _fixedSynonymsHash; - // AI Enrichment - hybrid approach: cache hits use enrich processor, misses are applied inline - private readonly ElasticsearchEnrichmentCache? _enrichmentCache; - private readonly ElasticsearchLlmClient? _llmClient; - private readonly EnrichPolicyManager? _enrichPolicyManager; - private readonly EnrichmentOptions _enrichmentOptions = new(); - private int _enrichmentCount; - private int _cacheHitCount; + // AI Enrichment - post-indexing via AiEnrichmentOrchestrator + private readonly AiEnrichmentOrchestrator? 
_aiEnrichment; // Shared ES operations with retry and task polling private readonly ElasticsearchOperations _operations; @@ -82,9 +77,14 @@ IDocumentationConfigurationContext context }).Where(r => fixedSynonyms.Contains(r.Id)).Select(r => r.Synonyms).ToArray(); _fixedSynonymsHash = HashedBulkUpdate.CreateHash(string.Join(",", indexTimeSynonyms)); - var aiPipeline = es.EnableAiEnrichment ? EnrichPolicyManager.PipelineName : null; var synonymSetName = $"docs-{buildType}"; + if (es.EnableAiEnrichment) + { + _aiEnrichment = new AiEnrichmentOrchestrator(_transport, DocumentationMappingContext.AiEnrichment); + } + + var aiPipeline = es.EnableAiEnrichment ? DocumentationMappingContext.AiEnrichment.PipelineName : null; var pipelineSettings = aiPipeline is not null ? new Dictionary { ["index.default_pipeline"] = aiPipeline } : null; @@ -95,14 +95,6 @@ IDocumentationConfigurationContext context IndexSettings = pipelineSettings }; - // Initialize AI enrichment services if enabled - if (es.EnableAiEnrichment) - { - _enrichmentCache = new ElasticsearchEnrichmentCache(_transport, logFactory.CreateLogger(), _operations); - _llmClient = new ElasticsearchLlmClient(_transport, logFactory.CreateLogger(), _operations); - _enrichPolicyManager = new EnrichPolicyManager(_transport, logFactory.CreateLogger(), _enrichmentCache.IndexName); - } - _semanticTypeContext = DocumentationMappingContext.DocumentationDocumentSemantic.CreateContext(type: buildType) with { ConfigureAnalysis = a => DocumentationAnalysisFactory.BuildAnalysis(a, synonymSetName, indexTimeSynonyms), @@ -117,13 +109,14 @@ IDocumentationConfigurationContext context { ConfigurePrimary = ConfigureChannelOptions, ConfigureSecondary = ConfigureChannelOptions, - OnPostComplete = es.EnableAiEnrichment + OnPostComplete = _aiEnrichment is not null ? 
async (ctx, _, ct) => await PostCompleteAsync(ctx, ct) : null }; _ = _orchestrator.AddPreBootstrapTask(async (_, ct) => { - await InitializeEnrichmentAsync(ct); + if (_aiEnrichment is not null) + await _aiEnrichment.InitializeAsync(ct); await PublishSynonymsAsync(ct); await PublishQueryRulesAsync(ct); }); @@ -164,69 +157,16 @@ public async ValueTask StartAsync(Cancel ctx = default) public async ValueTask StopAsync(Cancel ctx = default) => _ = await _orchestrator.CompleteAsync(null, ctx); - private async Task InitializeEnrichmentAsync(Cancel ctx) - { - if (_enrichmentCache is null || _enrichPolicyManager is null) - return; - - _logger.LogInformation("Initializing AI enrichment cache..."); - await _enrichmentCache.InitializeAsync(ctx); - _logger.LogInformation("AI enrichment cache ready with {Count} existing entries", _enrichmentCache.Count); - - _logger.LogInformation("Setting up enrich policy and pipeline..."); - await _enrichPolicyManager.ExecutePolicyAsync(ctx); - await _enrichPolicyManager.EnsurePipelineExistsAsync(ctx); - } - - private async Task PostCompleteAsync(OrchestratorContext context, Cancel ctx) => - await ExecuteEnrichPolicyIfNeededAsync(context.SecondaryWriteAlias, ctx); - - private async ValueTask ExecuteEnrichPolicyIfNeededAsync(string? 
semanticAlias, Cancel ctx) + private async Task PostCompleteAsync(OrchestratorContext context, Cancel ctx) { - if (_enrichmentCache is null || _enrichPolicyManager is null) + if (_aiEnrichment is null || context.SecondaryWriteAlias is null) return; + _logger.LogInformation("Starting post-indexing AI enrichment for {Alias}...", context.SecondaryWriteAlias); + var result = await _aiEnrichment.EnrichAsync(context.SecondaryWriteAlias, ctx); _logger.LogInformation( - "AI enrichment complete: {CacheHits} cache hits, {Enrichments} enrichments generated (limit: {Limit})", - _cacheHitCount, _enrichmentCount, _enrichmentOptions.MaxNewEnrichmentsPerRun); - - if (_enrichmentCache.Count > 0) - { - _logger.LogInformation("Executing enrich policy to update internal index with {Count} total entries...", _enrichmentCache.Count); - await _enrichPolicyManager.ExecutePolicyAsync(ctx); - - if (semanticAlias is not null) - await BackfillMissingAiFieldsAsync(semanticAlias, ctx); - } - } - - private async ValueTask BackfillMissingAiFieldsAsync(string semanticAlias, Cancel ctx) - { - if (_enrichmentCache is null || _llmClient is null) - return; - - var currentPromptHash = ElasticsearchLlmClient.PromptHash; - - _logger.LogInformation( - "Starting AI backfill for documents missing or stale AI fields (cache has {CacheCount} entries, prompt hash: {PromptHash})", - _enrichmentCache.Count, currentPromptHash[..8]); - - var query = $$""" - { - "query": { - "bool": { - "must": { "exists": { "field": "enrichment_key" } }, - "should": [ - { "bool": { "must_not": { "exists": { "field": "ai_questions" } } } }, - { "bool": { "must_not": { "term": { "enrichment_prompt_hash": "{{currentPromptHash}}" } } } } - ], - "minimum_should_match": 1 - } - } - } - """; - - await _operations.UpdateByQueryAsync(semanticAlias, PostData.String(query), EnrichPolicyManager.PipelineName, ctx); + "AI enrichment complete: {Enriched} enriched, {Failed} failed, {Skipped} skipped, {Total} candidates, reached limit: 
{ReachedLimit}", + result.Enriched, result.Failed, result.Skipped, result.TotalCandidates, result.ReachedLimit); } private async Task PublishSynonymsAsync(Cancel ctx) @@ -317,7 +257,7 @@ internal async ValueTask WriteDocumentAsync(DocumentationDocument doc, Can public void Dispose() { _orchestrator.Dispose(); - _llmClient?.Dispose(); + _aiEnrichment?.Dispose(); GC.SuppressFinalize(this); } } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchEnrichmentCache.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchEnrichmentCache.cs deleted file mode 100644 index 9a166f8d6..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchEnrichmentCache.cs +++ /dev/null @@ -1,374 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using System.Collections.Concurrent; -using System.Linq; -using System.Text.Json; -using System.Text.Json.Serialization; -using Elastic.Transport; -using Elastic.Transport.Products.Elasticsearch; -using Microsoft.Extensions.Logging; - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Represents the status of a cache entry for a given URL. -/// -public record CacheStatus(bool Exists, bool IsStale, CacheIndexEntry? Entry); - -/// -/// Elasticsearch-backed enrichment cache for use with the enrich processor. -/// Stores AI-generated enrichment fields directly (not as JSON string) for efficient lookups. -/// Provides URL-based lookups with staleness detection based on enrichment_key changes. -/// -public sealed class ElasticsearchEnrichmentCache( - ITransport transport, - ILogger logger, - ElasticsearchOperations? 
operations = null, - string indexName = "docs-ai-enriched-fields-cache") : IEnrichmentCache -{ - private readonly ITransport _transport = transport; - private readonly ILogger _logger = logger; - private readonly ElasticsearchOperations? _operations = operations; - - // Maps URL to cache entry for staleness detection - private readonly ConcurrentDictionary _entriesByUrl = new(); - - // Valid enrichment keys (with current prompt hash) for quick existence check - private readonly ConcurrentDictionary _validEntries = new(); - - public string IndexName { get; } = indexName; - - // language=json - // Note: No settings block - Serverless doesn't allow number_of_shards/replicas - private const string IndexMapping = """ - { - "mappings": { - "properties": { - "enrichment_key": { "type": "keyword" }, - "url": { "type": "keyword" }, - "ai_rag_optimized_summary": { "type": "text" }, - "ai_short_summary": { "type": "text" }, - "ai_search_query": { "type": "text" }, - "ai_questions": { "type": "text" }, - "ai_use_cases": { "type": "text" }, - "created_at": { "type": "date" }, - "prompt_hash": { "type": "keyword" } - } - } - } - """; - - /// - /// Number of valid cache entries (with current prompt hash). - /// - public int Count => _validEntries.Count; - - /// - /// Number of stale entries found during initialization (for logging only). - /// - public int StaleCount { get; private set; } - - public async Task InitializeAsync(CancellationToken ct) - { - await EnsureIndexExistsAsync(ct); - await LoadExistingHashesAsync(ct); - } - - /// - /// Checks if a valid enrichment exists in the cache (with current prompt hash). - /// Stale entries are treated as non-existent and will be regenerated. - /// - public bool Exists(string enrichmentKey) => _validEntries.ContainsKey(enrichmentKey); - - /// - /// Gets the cache status for a URL, including staleness detection. 
- /// - /// The document URL to look up - /// The current enrichment key (content hash) of the document - /// Cache status indicating if entry exists and whether it's stale - public CacheStatus GetStatus(string url, string currentEnrichmentKey) - { - if (!_entriesByUrl.TryGetValue(url, out var entry)) - return new CacheStatus(false, false, null); - - var isStale = entry.EnrichmentKey != currentEnrichmentKey; - return new CacheStatus(true, isStale, entry); - } - - public async Task StoreAsync(string enrichmentKey, string url, EnrichmentData data, CancellationToken ct) - { - var promptHash = ElasticsearchLlmClient.PromptHash; - var cacheEntry = new CacheIndexEntry - { - EnrichmentKey = enrichmentKey, - Url = url, - AiRagOptimizedSummary = data.RagOptimizedSummary, - AiShortSummary = data.ShortSummary, - AiSearchQuery = data.SearchQuery, - AiQuestions = data.Questions, - AiUseCases = data.UseCases, - CreatedAt = DateTimeOffset.UtcNow, - PromptHash = promptHash - }; - - var body = JsonSerializer.Serialize(cacheEntry, EnrichmentSerializerContext.Default.CacheIndexEntry); - var response = await _transport.PutAsync( - $"{IndexName}/_doc/{enrichmentKey}", - PostData.String(body), - ct); - - if (response.ApiCallDetails.HasSuccessfulStatusCode) - { - _ = _validEntries.TryAdd(enrichmentKey, 0); - _ = _entriesByUrl.AddOrUpdate(url, cacheEntry, (_, _) => cacheEntry); - - // Clean up any older entries for the same URL (but different enrichment_key) - await DeleteOldEntriesForUrlAsync(enrichmentKey, url, ct); - } - else - { - _logger.LogWarning("Failed to store enrichment: {StatusCode}", response.ApiCallDetails.HttpStatusCode); - } - } - - /// - /// Deletes older cache entries for the same URL, keeping only the current one. - /// This cleans up stale entries left behind when documents are re-enriched. - /// Uses wait_for_completion=false for async execution (fire-and-forget). 
- /// - private async Task DeleteOldEntriesForUrlAsync(string currentEnrichmentKey, string url, CancellationToken ct) - { - // Delete all entries with this URL except the current one - var deleteQuery = PostData.String($$""" - { - "query": { - "bool": { - "must": { "term": { "url": "{{url}}" } }, - "must_not": { "term": { "_id": "{{currentEnrichmentKey}}" } } - } - } - } - """); - - if (_operations is not null) - { - // Use shared operations for fire-and-forget delete - var taskId = await _operations.DeleteByQueryFireAndForgetAsync(IndexName, deleteQuery, ct); - if (taskId is not null) - _logger.LogDebug("Started cleanup task {TaskId} for URL {Url}", taskId, url); - } - else - { - // Fallback for when operations not provided - var response = await _transport.PostAsync( - $"{IndexName}/_delete_by_query?wait_for_completion=false", - deleteQuery, - ct); - - if (response.ApiCallDetails.HasSuccessfulStatusCode) - { - using var doc = JsonDocument.Parse(response.Body ?? "{}"); - if (doc.RootElement.TryGetProperty("task", out var taskProp)) - _logger.LogDebug("Started cleanup task {TaskId} for URL {Url}", taskProp.GetString(), url); - } - } - } - - private async Task EnsureIndexExistsAsync(CancellationToken ct) - { - var existsResponse = await _transport.HeadAsync(IndexName, ct); - if (existsResponse.ApiCallDetails.HasSuccessfulStatusCode) - { - _logger.LogDebug("Enrichment cache index {IndexName} already exists", IndexName); - return; - } - - _logger.LogInformation("Creating enrichment cache index {IndexName}...", IndexName); - var createResponse = await _transport.PutAsync( - IndexName, - PostData.String(IndexMapping), - ct); - - if (createResponse.ApiCallDetails.HasSuccessfulStatusCode) - _logger.LogInformation("Created enrichment cache index {IndexName}", IndexName); - else if (createResponse.ApiCallDetails.HttpStatusCode == 400 && - createResponse.Body?.Contains("resource_already_exists_exception") == true) - _logger.LogDebug("Enrichment cache index {IndexName} 
already exists (race condition)", IndexName); - else - _logger.LogError("Failed to create cache index: {StatusCode} - {Response}", - createResponse.ApiCallDetails.HttpStatusCode, createResponse.Body); - } - - private async Task LoadExistingHashesAsync(CancellationToken ct) - { - var sw = System.Diagnostics.Stopwatch.StartNew(); - var currentPromptHash = ElasticsearchLlmClient.PromptHash; - var staleCount = 0; - var totalCount = 0; - - // Fetch all fields needed for URL-based lookups and staleness detection - var scrollQuery = /*lang=json,strict*/ """{"size": 1000, "_source": ["enrichment_key", "url", "prompt_hash", "ai_rag_optimized_summary", "ai_short_summary", "ai_search_query", "ai_questions", "ai_use_cases", "created_at"], "query": {"match_all": {}}}"""; - - var searchResponse = await _transport.PostAsync( - $"{IndexName}/_search?scroll=1m", - PostData.String(scrollQuery), - ct); - - if (!searchResponse.ApiCallDetails.HasSuccessfulStatusCode) - { - _logger.LogWarning("Failed to load existing hashes: {StatusCode}", searchResponse.ApiCallDetails.HttpStatusCode); - return; - } - - var (batchTotal, batchStale, scrollId) = ProcessHashHits(searchResponse.Body, currentPromptHash); - totalCount += batchTotal; - staleCount += batchStale; - - while (scrollId is not null && batchTotal > 0) - { - var scrollBody = $$"""{"scroll": "1m", "scroll_id": "{{scrollId}}"}"""; - var scrollResponse = await _transport.PostAsync( - "_search/scroll", - PostData.String(scrollBody), - ct); - - if (!scrollResponse.ApiCallDetails.HasSuccessfulStatusCode) - break; - - (batchTotal, batchStale, scrollId) = ProcessHashHits(scrollResponse.Body, currentPromptHash); - totalCount += batchTotal; - staleCount += batchStale; - } - - StaleCount = staleCount; - _logger.LogInformation( - "Loaded {Total} enrichment cache entries: {Valid} valid (current prompt), {Stale} stale (will be refreshed) in {ElapsedMs}ms", - totalCount, _validEntries.Count, staleCount, sw.ElapsedMilliseconds); - } - - private (int 
total, int stale, string? scrollId) ProcessHashHits(string? responseBody, string currentPromptHash) - { - if (string.IsNullOrEmpty(responseBody)) - return (0, 0, null); - - using var doc = JsonDocument.Parse(responseBody); - - var scrollId = doc.RootElement.TryGetProperty("_scroll_id", out var scrollIdProp) - ? scrollIdProp.GetString() - : null; - - if (!doc.RootElement.TryGetProperty("hits", out var hitsObj) || - !hitsObj.TryGetProperty("hits", out var hitsArray)) - return (0, 0, scrollId); - - var total = 0; - var stale = 0; - foreach (var hit in hitsArray.EnumerateArray()) - { - if (!hit.TryGetProperty("_id", out var idProp) || !hit.TryGetProperty("_source", out var source)) - continue; - - var id = idProp.GetString(); - if (id is null) - continue; - - total++; - - // Parse the cache entry - var cacheEntry = ParseCacheEntry(source, id); - if (cacheEntry?.Url is not null) - { - // Store by URL for staleness detection - keep latest entry per URL - _ = _entriesByUrl.AddOrUpdate(cacheEntry.Url, cacheEntry, - (_, existing) => cacheEntry.CreatedAt > existing.CreatedAt ? cacheEntry : existing); - } - - // Only add to valid entries if current prompt hash - if (source.TryGetProperty("prompt_hash", out var promptHashProp) && - promptHashProp.GetString() == currentPromptHash) - { - _ = _validEntries.TryAdd(id, 0); - } - else - { - stale++; - } - } - return (total, stale, scrollId); - } - - private CacheIndexEntry? ParseCacheEntry(JsonElement source, string id) - { - try - { - return new CacheIndexEntry - { - EnrichmentKey = source.TryGetProperty("enrichment_key", out var ek) ? ek.GetString() ?? id : id, - Url = source.TryGetProperty("url", out var url) ? url.GetString() : null, - AiRagOptimizedSummary = source.TryGetProperty("ai_rag_optimized_summary", out var rag) ? rag.GetString() : null, - AiShortSummary = source.TryGetProperty("ai_short_summary", out var ss) ? ss.GetString() : null, - AiSearchQuery = source.TryGetProperty("ai_search_query", out var sq) ? 
sq.GetString() : null, - AiQuestions = source.TryGetProperty("ai_questions", out var q) ? ParseStringArray(q) : null, - AiUseCases = source.TryGetProperty("ai_use_cases", out var uc) ? ParseStringArray(uc) : null, - CreatedAt = source.TryGetProperty("created_at", out var ca) && ca.TryGetDateTimeOffset(out var dt) ? dt : DateTimeOffset.MinValue, - PromptHash = source.TryGetProperty("prompt_hash", out var ph) ? ph.GetString() ?? "" : "" - }; - } - catch (JsonException ex) - { - logger.LogWarning(ex, "Failed to parse cache entry with id {Id}", id); - return null; - } - } - - private static string[]? ParseStringArray(JsonElement element) - { - if (element.ValueKind != JsonValueKind.Array) - return null; - - return element.EnumerateArray() - .Select(e => e.GetString()) - .Where(s => s is not null) - .Cast() - .ToArray(); - } -} - -/// -/// Document structure for the enrichment cache index. -/// Fields are stored directly for use with the enrich processor. -/// -public sealed record CacheIndexEntry -{ - [JsonPropertyName("enrichment_key")] - public required string EnrichmentKey { get; init; } - - /// - /// Document URL for debugging - helps identify which document this cache entry belongs to. - /// - [JsonPropertyName("url")] - public string? Url { get; init; } - - [JsonPropertyName("ai_rag_optimized_summary")] - public string? AiRagOptimizedSummary { get; init; } - - [JsonPropertyName("ai_short_summary")] - public string? AiShortSummary { get; init; } - - [JsonPropertyName("ai_search_query")] - public string? AiSearchQuery { get; init; } - - [JsonPropertyName("ai_questions")] - public string[]? AiQuestions { get; init; } - - [JsonPropertyName("ai_use_cases")] - public string[]? 
AiUseCases { get; init; } - - [JsonPropertyName("created_at")] - public required DateTimeOffset CreatedAt { get; init; } - - [JsonPropertyName("prompt_hash")] - public required string PromptHash { get; init; } -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchLlmClient.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchLlmClient.cs deleted file mode 100644 index 46e70ab3f..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ElasticsearchLlmClient.cs +++ /dev/null @@ -1,455 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using System.Security.Cryptography; -using System.Text; -using System.Text.Json; -using System.Text.Json.Serialization; -using Elastic.Transport; -using Elastic.Transport.Products.Elasticsearch; -using Microsoft.Extensions.Logging; - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Elasticsearch inference API-backed implementation of . -/// Uses a semaphore to throttle concurrent LLM calls with exponential backoff on 429. -/// For large documents, uses hierarchical summarization (map-reduce) to handle content -/// that exceeds LLM context windows. -/// -public sealed class ElasticsearchLlmClient( - ITransport transport, - ILogger logger, - ElasticsearchOperations? operations = null, - int maxConcurrency = 10, - string inferenceEndpointId = ".gp-llm-v2-completion") : ILlmClient, IDisposable -{ - private readonly ITransport _transport = transport; - private readonly ILogger _logger = logger; - private readonly ElasticsearchOperations? _operations = operations; - private readonly SemaphoreSlim _throttle = new(maxConcurrency); - private readonly string _inferenceEndpointId = inferenceEndpointId; - - /// - /// Maximum body length in characters for direct enrichment. 
- /// Documents larger than this use hierarchical summarization. - /// Based on analysis: only ~2 docs exceed 400K stripped chars. - /// EIS uses Claude Sonnet models with 200K token (~800K char) context windows. - /// See: https://www.elastic.co/docs/explore-analyze/elastic-inference/eis - /// - private const int MaxBodyLength = 400_000; - - /// - /// Maximum chunk size for hierarchical summarization. - /// Actual chunk size is calculated dynamically for even distribution. - /// With prompt overhead (~10K), total is ~210K chars (~52K tokens), well under 200K token limit. - /// See: https://www.elastic.co/docs/explore-analyze/elastic-inference/eis - /// - private const int MaxChunkSize = 200_000; - - private static readonly Lazy PromptHashLazy = new(() => - { - // Include both prompts (with and without context) in hash so cache invalidates if any changes - var combinedPrompts = BuildEnrichmentPrompt("", "") + - BuildChunkSummaryPrompt("", "", 0, 0, null) + - BuildChunkSummaryPrompt("", "", 0, 0, "prev"); - var hash = SHA256.HashData(Encoding.UTF8.GetBytes(combinedPrompts)); - return Convert.ToHexString(hash).ToLowerInvariant(); - }); - - /// - /// Hash of the prompt templates. Changes when any prompt changes, triggering cache invalidation. - /// - public static string PromptHash => PromptHashLazy.Value; - - public async Task EnrichAsync(string title, string body, CancellationToken ct) - { - await _throttle.WaitAsync(ct); - try - { - // For small documents, enrich directly - if (body.Length <= MaxBodyLength) - return await CallEnrichmentAsync(title, body, ct); - - // For large documents, use hierarchical summarization - return await EnrichLargeDocumentAsync(title, body, ct); - } - finally - { - _ = _throttle.Release(); - } - } - - public void Dispose() => _throttle.Dispose(); - - /// - /// Hierarchical summarization for large documents (map-reduce): - /// 1. Split into chunks - /// 2. Summarize each chunk (map phase) - /// 3. 
Combine summaries and generate enrichment (reduce phase) - /// - private async Task EnrichLargeDocumentAsync(string title, string body, CancellationToken ct) - { - var chunks = SplitIntoChunks(body); - _logger.LogInformation( - "Using hierarchical summarization for large document ({Length} chars, {ChunkCount} chunks): {Title}", - body.Length, chunks.Count, title); - - // Map phase: summarize each chunk, passing previous summary as context - var chunkSummaries = new List(); - string? previousSummary = null; - - for (var i = 0; i < chunks.Count; i++) - { - var summary = await SummarizeChunkAsync(title, chunks[i], i + 1, chunks.Count, previousSummary, ct); - - // All-or-nothing: if any chunk fails, skip the entire document - if (string.IsNullOrWhiteSpace(summary)) - { - _logger.LogWarning( - "Chunk {ChunkNum}/{TotalCount} failed - skipping enrichment for: {Title}", - i + 1, chunks.Count, title); - return null; - } - - chunkSummaries.Add(summary); - previousSummary = summary; - } - - // Reduce phase: combine summaries and generate enrichment - var combinedSummary = string.Join("\n\n---\n\n", chunkSummaries); - _logger.LogDebug( - "Combined {SummaryCount} chunk summaries ({CombinedLength} chars) for enrichment: {Title}", - chunkSummaries.Count, combinedSummary.Length, title); - - return await CallEnrichmentAsync(title, combinedSummary, ct); - } - - /// - /// Splits document into chunks at paragraph boundaries. - /// Uses dynamic chunk sizing for even distribution while respecting MaxChunkSize. 
- /// - private static List SplitIntoChunks(string body) - { - // Calculate dynamic chunk size for even distribution - var numChunks = (int)Math.Ceiling((double)body.Length / MaxChunkSize); - var targetSize = (int)Math.Ceiling((double)body.Length / numChunks); - - var paragraphs = body.Split(["\n\n", "\r\n\r\n"], StringSplitOptions.RemoveEmptyEntries); - var chunks = new List(); - var currentParagraphs = new List(); - var currentSize = 0; - - void FlushCurrentChunk() - { - if (currentParagraphs.Count == 0) - return; - - chunks.Add(string.Join("\n\n", currentParagraphs)); - currentParagraphs.Clear(); - currentSize = 0; - } - - foreach (var paragraph in paragraphs) - { - var wouldExceedTarget = currentSize + paragraph.Length > targetSize; - - // Start new chunk if adding this paragraph would exceed target - if (currentParagraphs.Count > 0 && wouldExceedTarget) - FlushCurrentChunk(); - - currentParagraphs.Add(paragraph); - currentSize += paragraph.Length; - - // Immediately flush oversized paragraphs - if (currentSize > targetSize) - FlushCurrentChunk(); - } - - FlushCurrentChunk(); - - return chunks; - } - - /// - /// Summarizes a single chunk with a lightweight prompt. - /// Includes the previous chunk's summary as context for continuity. - /// - private async Task SummarizeChunkAsync( - string title, - string chunk, - int chunkNum, - int totalChunks, - string? previousSummary, - CancellationToken ct) - { - var prompt = BuildChunkSummaryPrompt(title, chunk, chunkNum, totalChunks, previousSummary); - var response = await CallInferenceAsync(prompt, ct); - - if (string.IsNullOrEmpty(response)) - { - _logger.LogWarning("Failed to summarize chunk {ChunkNum}/{TotalChunks} for: {Title}", - chunkNum, totalChunks, title); - return null; - } - - return response.Trim(); - } - - /// - /// Generates enrichment data from the document content. 
- /// - private async Task CallEnrichmentAsync(string title, string body, CancellationToken ct) - { - var prompt = BuildEnrichmentPrompt(title, body); - - _logger.LogDebug( - "Calling LLM for enrichment: {Title} (body: {BodyLength} chars, prompt: {PromptLength} chars)", - title, body.Length, prompt.Length); - - var response = await CallInferenceAsync(prompt, ct); - return ParseEnrichmentResponse(title, response); - } - - /// - /// Calls the inference API with retry logic and returns the raw text response. - /// - private async Task CallInferenceAsync(string prompt, CancellationToken ct) - { - var request = new InferenceRequest { Input = prompt }; - var requestBody = JsonSerializer.Serialize(request, EnrichmentSerializerContext.Default.InferenceRequest); - var url = $"_inference/completion/{_inferenceEndpointId}"; - var postData = PostData.String(requestBody); - - var response = _operations is not null - ? await _operations.WithRetryAsync( - () => _transport.PostAsync(url, postData, ct), - "inference", - ct) - : await _transport.PostAsync(url, postData, ct); - - if (response.ApiCallDetails.HasSuccessfulStatusCode) - return ExtractCompletionText(response.Body); - - var responsePreview = response.Body?.Length > 1000 - ? response.Body[..1000] + "..." - : response.Body; - - _logger.LogWarning( - "LLM inference failed: HTTP {StatusCode}. Prompt length: {PromptLength} chars. Response: {Response}", - response.ApiCallDetails.HttpStatusCode, prompt.Length, responsePreview); - return null; - } - - /// - /// Extracts the completion text from the inference API response. - /// - private string? ExtractCompletionText(string? 
responseBody) - { - if (string.IsNullOrEmpty(responseBody)) - return null; - - try - { - var completionResponse = JsonSerializer.Deserialize(responseBody, EnrichmentSerializerContext.Default.CompletionResponse); - return completionResponse?.Completion?.FirstOrDefault()?.Result; - } - catch (JsonException ex) - { - _logger.LogWarning("Failed to parse inference response: {Error}", ex.Message); - return null; - } - } - - /// - /// Parses the enrichment JSON from the LLM response. - /// - private EnrichmentData? ParseEnrichmentResponse(string title, string? responseText) - { - if (string.IsNullOrEmpty(responseText)) - { - _logger.LogWarning("Empty LLM response for enrichment: {Title}", title); - return null; - } - - try - { - var cleaned = CleanLlmResponse(responseText); - var result = JsonSerializer.Deserialize(cleaned, EnrichmentSerializerContext.Default.EnrichmentData); - - if (result is null || !result.HasData) - { - var responsePreview = cleaned.Length > 500 - ? cleaned[..500] + "..." - : cleaned; - _logger.LogWarning( - "LLM response parsed but has no data for {Title}. Response: {Response}", - title, responsePreview); - return null; - } - - _logger.LogDebug("Successfully enriched {Title}", title); - return result; - } - catch (JsonException ex) - { - var responsePreview = responseText.Length > 500 - ? responseText[..500] + "..." - : responseText; - _logger.LogWarning( - "Failed to parse LLM response for {Title}. Error: {Error}. 
Response: {Response}", - title, ex.Message, responsePreview); - return null; - } - } - - private static string CleanLlmResponse(string response) - { - var cleaned = response.Replace("```json", "").Replace("```", "").Trim(); - - // Fix common LLM issue: extra closing brace - if (cleaned.EndsWith("}}") && !cleaned.Contains("{{")) - cleaned = cleaned[..^1]; - - // Fix common LLM issue: trailing backticks from incomplete code block syntax - cleaned = cleaned.TrimEnd('`'); - - return cleaned; - } - - /// - /// Builds a lightweight prompt for summarizing a single chunk. - /// Includes previous summary for context continuity. - /// Output is plain text (not JSON) to minimize token usage. - /// - private static string BuildChunkSummaryPrompt( - string title, - string chunk, - int chunkNum, - int totalChunks, - string? previousSummary) - { - var contextSection = string.IsNullOrEmpty(previousSummary) - ? "" - : $""" - - - {previousSummary} - - - - Build on this context. Avoid repeating information already covered. Focus on new concepts introduced in this section. - - """; - - return $$""" - - Summarize this section of a technical document for Elastic documentation. - Focus on: API endpoints, methods, parameters, configuration options, and key technical concepts. - - - - - Output plain text only, no JSON, no markdown formatting - - Maximum 750 words - - Be concise but preserve all technical details - - - - Title: {{title}} - Section: {{chunkNum}} of {{totalChunks}} - - {{contextSection}} - - {{chunk}} - - """; - } - - /// - /// Builds the main enrichment prompt that generates the JSON metadata. - /// - private static string BuildEnrichmentPrompt(string title, string body) => - $$""" - - Expert technical writer creating search metadata for Elastic documentation (Elasticsearch, Kibana, Beats, Logstash). Audience: developers, DevOps, data engineers. - - - - Return a single valid JSON object. No markdown, no extra text, no trailing characters. 
- - - - { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "required": ["ai_rag_optimized_summary", "ai_short_summary", "ai_search_query", "ai_questions", "ai_use_cases"], - "additionalProperties": false, - "properties": { - "ai_rag_optimized_summary": { - "type": "string", - "description": "3-5 sentences densely packed with technical entities for semantic vector matching. Include: API endpoint names, method names, parameter names, configuration options, data types, and core functionality. Write for RAG retrieval - someone asking 'how do I configure X' should match this text." - }, - "ai_short_summary": { - "type": "string", - "description": "Exactly 5-10 words for UI tooltip or search snippet. Action-oriented, starts with a verb. Example: 'Configure index lifecycle policies for data retention'" - }, - "ai_search_query": { - "type": "string", - "description": "3-8 keywords representing a realistic search query a developer would type. Include product name and key technical terms. Example: 'elasticsearch bulk api batch indexing'" - }, - "ai_questions": { - "type": "array", - "items": { "type": "string" }, - "minItems": 3, - "maxItems": 5, - "description": "Natural questions a dev would ask (6-15 words). Not too short, not too verbose. Examples: 'How do I bulk index documents?', 'What format does the bulk API use?', 'Why is my bulk request failing?'" - }, - "ai_use_cases": { - "type": "array", - "items": { "type": "string" }, - "minItems": 2, - "maxItems": 4, - "description": "Simple 2-4 word tasks a dev wants to do. Examples: 'index documents', 'check cluster health', 'enable TLS', 'fix slow queries', 'backup data'" - } - } - } - - - - - Extract ONLY from provided content. Never hallucinate APIs or features not mentioned. - - Be specific: 'configure index lifecycle policy' not 'manage data'. - - Avoid generic phrases: no 'comprehensive guide', 'powerful feature', 'easy to use'. - - Output exactly ONE opening brace and ONE closing brace. 
- - - - {"ai_rag_optimized_summary":"The Bulk API executes multiple index, create, delete, and update operations in a single NDJSON request. Endpoint: POST _bulk or POST /{index}/_bulk. Each action requires metadata line (index, create, update, delete) followed by optional document source. Supports parameters: routing, pipeline, refresh, require_alias. Returns per-operation results with _id, _version, result status, and error details for partial failures.","ai_short_summary":"Execute batch document operations in single request","ai_search_query":"elasticsearch bulk api batch index update delete","ai_questions":["How do I bulk index documents?","What format does the bulk API use?","How do I handle bulk operation errors?"],"ai_use_cases":["bulk index documents","batch update data","delete many docs"]} - - - - {{title}} - - {{body}} - - - """; -} - -public sealed record InferenceRequest -{ - [JsonPropertyName("input")] - public required string Input { get; init; } -} - -public sealed record CompletionResponse -{ - [JsonPropertyName("completion")] - public CompletionResult[]? Completion { get; init; } -} - -public sealed record CompletionResult -{ - [JsonPropertyName("result")] - public string? Result { get; init; } -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichPolicyManager.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichPolicyManager.cs deleted file mode 100644 index 14d55c575..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichPolicyManager.cs +++ /dev/null @@ -1,177 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. 
-// See the LICENSE file in the project root for more information - -using System.Security.Cryptography; -using System.Text; -using Elastic.Transport; -using Elastic.Transport.Products.Elasticsearch; -using Microsoft.Extensions.Logging; - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Manages the Elasticsearch enrich policy and ingest pipeline for AI document enrichment. -/// Policy name is versioned based on the enrich fields, allowing seamless schema evolution. -/// -public sealed class EnrichPolicyManager( - DistributedTransport transport, - ILogger logger, - string cacheIndexName = "docs-ai-enriched-fields-cache") -{ - private readonly DistributedTransport _transport = transport; - private readonly ILogger _logger = logger; - private readonly string _cacheIndexName = cacheIndexName; - - /// - /// The fields included in the enrich policy. Changes here auto-version the policy. - /// - private static readonly string[] EnrichFields = - ["ai_rag_optimized_summary", "ai_short_summary", "ai_search_query", "ai_questions", "ai_use_cases", "prompt_hash"]; - - /// - /// Policy name includes a short hash of the fields for automatic versioning. - /// When fields change, a new policy is created without deleting the old one. - /// - public static string PolicyName { get; } = $"ai-enrichment-policy-{ComputeFieldsHash()}"; - - public const string PipelineName = "ai-enrichment-pipeline"; - - /// - /// Computes a short hash of the enrich fields for policy versioning. - /// - private static string ComputeFieldsHash() - { - var fieldsString = string.Join(",", EnrichFields); - var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(fieldsString)); - return Convert.ToHexString(hashBytes)[..8].ToLowerInvariant(); - } - - /// - /// Generates the ingest pipeline body with the current policy name. - /// Pipeline uses URL-based matching to always find enrichment even when content changes. 
- /// - private static string GetIngestPipelineBody() => $$""" - { - "description": "Enriches documents with AI-generated fields from the enrichment cache (matched by URL)", - "processors": [ - { - "enrich": { - "policy_name": "{{PolicyName}}", - "field": "url", - "target_field": "ai_enrichment", - "max_matches": 1, - "ignore_missing": true - } - }, - { - "script": { - "description": "Flatten ai_enrichment fields to document root", - "if": "ctx.ai_enrichment != null", - "source": "if (ctx.ai_enrichment.ai_rag_optimized_summary != null) ctx.ai_rag_optimized_summary = ctx.ai_enrichment.ai_rag_optimized_summary; if (ctx.ai_enrichment.ai_short_summary != null) ctx.ai_short_summary = ctx.ai_enrichment.ai_short_summary; if (ctx.ai_enrichment.ai_search_query != null) ctx.ai_search_query = ctx.ai_enrichment.ai_search_query; if (ctx.ai_enrichment.ai_questions != null) ctx.ai_questions = ctx.ai_enrichment.ai_questions; if (ctx.ai_enrichment.ai_use_cases != null) ctx.ai_use_cases = ctx.ai_enrichment.ai_use_cases; if (ctx.ai_enrichment.prompt_hash != null) ctx.enrichment_prompt_hash = ctx.ai_enrichment.prompt_hash; ctx.remove('ai_enrichment');" - } - } - ] - } - """; - - /// - /// Ensures the enrich policy exists. Policy name is versioned based on fields, - /// so if the policy exists, it has the correct definition by design. 
- /// - public async Task EnsurePolicyExistsAsync(CancellationToken ct) - { - var existsResponse = await _transport.GetAsync($"_enrich/policy/{PolicyName}", ct); - - if (existsResponse.ApiCallDetails.HasSuccessfulStatusCode && - existsResponse.Body?.Contains(PolicyName) == true) - { - _logger.LogInformation("Enrich policy {PolicyName} already exists", PolicyName); - return; - } - - _logger.LogInformation("Creating enrich policy {PolicyName} for index {CacheIndex}...", PolicyName, _cacheIndexName); - - // Match by URL to always find enrichment even when document content changes - var enrichFieldsJson = string.Join(", ", EnrichFields.Select(f => $"\"{f}\"")); - var policyBody = $$""" - { - "match": { - "indices": "{{_cacheIndexName}}", - "match_field": "url", - "enrich_fields": [{{enrichFieldsJson}}] - } - } - """; - - var createResponse = await _transport.PutAsync( - $"_enrich/policy/{PolicyName}", - PostData.String(policyBody), - ct); - - if (createResponse.ApiCallDetails.HasSuccessfulStatusCode) - _logger.LogInformation("Created enrich policy {PolicyName}", PolicyName); - else - _logger.LogError("Failed to create enrich policy: {StatusCode} - {Response}", - createResponse.ApiCallDetails.HttpStatusCode, createResponse.Body); - } - - /// - /// Executes the enrich policy to rebuild the enrich index with latest data. - /// Call this after adding new entries to the cache index. 
- /// - public async Task ExecutePolicyAsync(CancellationToken ct) - { - // Verify policy exists before executing - var checkResponse = await _transport.GetAsync($"_enrich/policy/{PolicyName}", ct); - _logger.LogDebug("Pre-execute policy check: {StatusCode} - {Body}", - checkResponse.ApiCallDetails.HttpStatusCode, checkResponse.Body); - - if (!checkResponse.ApiCallDetails.HasSuccessfulStatusCode || - checkResponse.Body?.Contains(PolicyName) != true) - { - _logger.LogInformation("Policy {PolicyName} not found, creating...", PolicyName); - await EnsurePolicyExistsAsync(ct); - // Small delay for Serverless propagation - await Task.Delay(2000, ct); - } - - _logger.LogInformation("Executing enrich policy {PolicyName}...", PolicyName); - - var response = await _transport.PostAsync( - $"_enrich/policy/{PolicyName}/_execute", - PostData.Empty, - ct); - - if (response.ApiCallDetails.HasSuccessfulStatusCode) - _logger.LogInformation("Enrich policy executed successfully"); - else - _logger.LogWarning("Enrich policy execution failed (may be empty): {StatusCode} - {Response}", - response.ApiCallDetails.HttpStatusCode, response.Body); - } - - /// - /// Ensures the ingest pipeline exists with the current definition. - /// Always overwrites to pick up any script/processor changes. 
- /// - public async Task EnsurePipelineExistsAsync(CancellationToken ct) - { - // PUT is idempotent - always update to ensure pipeline definition is current - _logger.LogInformation("Creating/updating ingest pipeline {PipelineName} (using policy {PolicyName})...", PipelineName, PolicyName); - var createResponse = await _transport.PutAsync( - $"_ingest/pipeline/{PipelineName}", - PostData.String(GetIngestPipelineBody()), - ct); - - if (createResponse.ApiCallDetails.HasSuccessfulStatusCode) - _logger.LogInformation("Created/updated ingest pipeline {PipelineName}", PipelineName); - else - _logger.LogError("Failed to create ingest pipeline: {StatusCode} - {Response}", - createResponse.ApiCallDetails.HttpStatusCode, createResponse.Body); - } - - /// - /// Ensures the enrich policy exists. Pipeline is created separately in StartAsync. - /// - public async Task InitializeAsync(CancellationToken ct) => await EnsurePolicyExistsAsync(ct); -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentKeyGenerator.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentKeyGenerator.cs deleted file mode 100644 index a22bbb44f..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentKeyGenerator.cs +++ /dev/null @@ -1,31 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using System.Security.Cryptography; -using System.Text; -using System.Text.RegularExpressions; - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Generates enrichment keys for AI enrichment cache lookups. -/// The key includes the prompt hash so that prompt changes trigger automatic cache invalidation. -/// -public static partial class EnrichmentKeyGenerator -{ - /// - /// Generates a content-based enrichment key (without prompt hash). 
- /// This allows stale enrichments from old prompts to still be applied when the prompt changes. - /// New enrichments will gradually replace stale ones as they're generated. - /// - public static string Generate(string title, string body) - { - var normalized = NormalizeRegex().Replace(title + body, "").ToLowerInvariant(); - var hash = SHA256.HashData(Encoding.UTF8.GetBytes(normalized)); - return Convert.ToHexString(hash).ToLowerInvariant(); - } - - [GeneratedRegex("[^a-zA-Z0-9]")] - private static partial Regex NormalizeRegex(); -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentOptions.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentOptions.cs deleted file mode 100644 index c5e433dc3..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/EnrichmentOptions.cs +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Configuration options for document enrichment. -/// -public sealed record EnrichmentOptions -{ - /// - /// Whether enrichment is enabled. - /// - public bool Enabled { get; init; } - - /// - /// Maximum enrichments per run (new + stale refresh). Limits LLM calls to prevent long deployments. - /// Stale entries (with old prompt hash) are treated as non-existent and count toward this limit. - /// - public int MaxNewEnrichmentsPerRun { get; init; } = 100; - - /// - /// Maximum concurrent LLM calls. - /// - public int MaxConcurrentLlmCalls { get; init; } = 4; - - /// - /// Creates options with enrichment disabled. 
- /// - public static EnrichmentOptions Disabled => new() { Enabled = false }; -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/IEnrichmentCache.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/IEnrichmentCache.cs deleted file mode 100644 index a57a34f10..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/IEnrichmentCache.cs +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Abstraction for enrichment cache operations. -/// With the enrich processor pattern, the cache stores enrichment data that -/// gets joined to documents at index time via an Elasticsearch enrich processor. -/// -public interface IEnrichmentCache -{ - /// - /// The name of the cache index. - /// - string IndexName { get; } - - /// - /// Initializes the cache, including index creation and loading existing hashes. - /// - Task InitializeAsync(CancellationToken ct); - - /// - /// Checks if an enrichment exists for the given enrichment key. - /// - bool Exists(string enrichmentKey); - - /// - /// Stores enrichment data in the cache. - /// - /// The enrichment key (content hash). - /// The document URL for debugging. - /// The enrichment data to store. - /// Cancellation token. - Task StoreAsync(string enrichmentKey, string url, EnrichmentData data, CancellationToken ct); - - /// - /// Gets the number of entries currently in the cache. 
- /// - int Count { get; } -} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ILlmClient.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ILlmClient.cs deleted file mode 100644 index 5195f39d7..000000000 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/Enrichment/ILlmClient.cs +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using System.Text.Json.Serialization; - -namespace Elastic.Markdown.Exporters.Elasticsearch.Enrichment; - -/// -/// Abstraction for LLM inference operations. -/// Enables swapping implementations and testing. -/// -public interface ILlmClient : IDisposable -{ - /// - /// Generates enrichment data for the given document content. - /// - /// The document title. - /// The document body content. - /// Cancellation token. - /// The enrichment data if successful, null otherwise. - Task EnrichAsync(string title, string body, CancellationToken ct); -} - -/// -/// AI-generated enrichment fields for a document. -/// -public sealed record EnrichmentData -{ - [JsonPropertyName("ai_rag_optimized_summary")] - public string? RagOptimizedSummary { get; init; } - - [JsonPropertyName("ai_short_summary")] - public string? ShortSummary { get; init; } - - [JsonPropertyName("ai_search_query")] - public string? SearchQuery { get; init; } - - [JsonPropertyName("ai_questions")] - public string[]? Questions { get; init; } - - [JsonPropertyName("ai_use_cases")] - public string[]? 
UseCases { get; init; } - - public bool HasData => - !string.IsNullOrEmpty(RagOptimizedSummary) || - !string.IsNullOrEmpty(ShortSummary) || - !string.IsNullOrEmpty(SearchQuery) || - Questions is { Length: > 0 } || - UseCases is { Length: > 0 }; -} - -[JsonSerializable(typeof(EnrichmentData))] -[JsonSerializable(typeof(InferenceRequest))] -[JsonSerializable(typeof(CompletionResponse))] -[JsonSerializable(typeof(CacheIndexEntry))] -internal sealed partial class EnrichmentSerializerContext : JsonSerializerContext; diff --git a/tests/Elastic.Markdown.Tests/Enrichment/ElasticsearchLlmClientTests.cs b/tests/Elastic.Markdown.Tests/Enrichment/ElasticsearchLlmClientTests.cs deleted file mode 100644 index a32e2719e..000000000 --- a/tests/Elastic.Markdown.Tests/Enrichment/ElasticsearchLlmClientTests.cs +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using System.Reflection; -using FluentAssertions; - -namespace Elastic.Markdown.Tests.Enrichment; - -/// -/// Tests for the chunking logic in ElasticsearchLlmClient. -/// These test the pure SplitIntoChunks function without needing network mocks. 
-/// -public class ChunkingTests -{ - // Use reflection to access the private static method - private static List SplitIntoChunks(string body) - { - var type = typeof(Elastic.Markdown.Exporters.Elasticsearch.Enrichment.ElasticsearchLlmClient); - var method = type.GetMethod("SplitIntoChunks", BindingFlags.NonPublic | BindingFlags.Static); - return (List)method!.Invoke(null, [body])!; - } - - [Fact] - public void SplitIntoChunks_SmallDocument_ReturnsSingleChunk() - { - // Arrange - document smaller than MaxChunkSize (200K) - var body = string.Join("\n\n", Enumerable.Range(1, 100).Select(i => $"Paragraph {i} content here.")); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - chunks[0].Should().Contain("Paragraph 1"); - chunks[0].Should().Contain("Paragraph 100"); - } - - [Fact] - public void SplitIntoChunks_500K_SplitsIntoFourChunks() - { - // Arrange - 50 paragraphs × 10K = ~500K chars - var paragraph = new string('x', 10_000); - var body = string.Join("\n\n", Enumerable.Range(1, 50).Select(_ => paragraph)); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(4); - } - - [Fact] - public void SplitIntoChunks_1M_SplitsIntoSevenChunks() - { - // Arrange - 50 paragraphs × 20K = ~1M chars - // numChunks = ceil(1M / 200K) = 6, targetSize ~166K - // With 20K paragraphs: 8 fit per chunk, 50/8 = 6.25 → 7 chunks - var paragraph = new string('y', 20_000); - var body = string.Join("\n\n", Enumerable.Range(1, 50).Select(_ => paragraph)); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(7); - } - - [Fact] - public void SplitIntoChunks_NoParagraphBreaks_ReturnsSingleChunk() - { - // Arrange - 300K chars with no paragraph breaks - var body = new string('z', 300_000); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - } - - [Fact] - public void SplitIntoChunks_PreservesAllContent() - { - // Arrange - 30 small 
paragraphs with unique identifiers - var paragraphs = Enumerable.Range(1, 30).Select(i => $"Paragraph {i} id=#{i}#").ToList(); - var body = string.Join("\n\n", paragraphs); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - foreach (var i in Enumerable.Range(1, 30)) - chunks[0].Should().Contain($"id=#{i}#"); - } - - [Fact] - public void SplitIntoChunks_FiltersEmptyParagraphs() - { - // Arrange - consecutive newlines create empty paragraphs that get filtered - var body = "First\n\n\n\nSecond\n\n\n\n\n\nThird"; - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - chunks[0].Should().Be("First\n\nSecond\n\nThird"); - } - - [Fact] - public void SplitIntoChunks_250K_SplitsIntoTwoChunks() - { - // Arrange - 50 paragraphs × 5K = ~250K chars - var paragraph = new string('a', 5_000); - var body = string.Join("\n\n", Enumerable.Range(1, 50).Select(_ => paragraph)); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(2); - } - - [Fact] - public void SplitIntoChunks_FinalFlush_CapturesRemainingContent() - { - // Arrange - content that doesn't trigger mid-loop flush - // but must be captured by final FlushCurrentChunk() call - var body = "First\n\nSecond\n\nThird"; - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - all content captured in final flush - chunks.Should().HaveCount(1); - chunks[0].Should().Contain("First"); - chunks[0].Should().Contain("Second"); - chunks[0].Should().Contain("Third"); - } - - // === Boundary tests around MaxChunkSize (200,000) === - // Note: MaxBodyLength is 400K (for direct enrichment), but MaxChunkSize is 200K (for chunking) - - [Fact] - public void SplitIntoChunks_Boundary_Minus100_SingleChunk() - { - // Arrange - 199,900 chars (100 below MaxChunkSize) - var body = new string('a', 199_900); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - } - - [Fact] - 
public void SplitIntoChunks_Boundary_Minus1_SingleChunk() - { - // Arrange - 199,999 chars (1 below MaxChunkSize) - var body = new string('b', 199_999); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - } - - [Fact] - public void SplitIntoChunks_Boundary_Exact_SingleChunk() - { - // Arrange - exactly 200,000 chars (at MaxChunkSize) - var body = new string('c', 200_000); - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(1); - } - - [Fact] - public void SplitIntoChunks_Boundary_Plus1_TwoChunks() - { - // Arrange - 200,001 chars (1 above MaxChunkSize) - // numChunks = ceil(200001 / 200000) = 2 - // With paragraphs, this splits into 2 chunks - var paragraph = new string('d', 100_000); - var body = $"{paragraph}\n\n{paragraph}a"; // 200,003 chars - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(2); - } - - [Fact] - public void SplitIntoChunks_Boundary_Plus100_TwoChunks() - { - // Arrange - 200,100 chars (100 above MaxChunkSize) - var paragraph = new string('e', 100_000); - var body = $"{paragraph}\n\n{paragraph}{new string('f', 98)}"; // 200,100 chars - - // Act - var chunks = SplitIntoChunks(body); - - // Assert - chunks.Should().HaveCount(2); - } - - // === Content integrity tests === - - [Fact] - public void SplitIntoChunks_ContentIntegrity_NothingLost() - { - // Arrange - paragraphs with unique markers - var paragraphs = Enumerable.Range(1, 100).Select(i => $"[P{i}]{new string('x', 5_000)}[/P{i}]").ToList(); - var body = string.Join("\n\n", paragraphs); - - // Act - var chunks = SplitIntoChunks(body); - var reassembled = string.Join("\n\n", chunks); - - // Assert - every paragraph marker present - foreach (var i in Enumerable.Range(1, 100)) - { - reassembled.Should().Contain($"[P{i}]"); - reassembled.Should().Contain($"[/P{i}]"); - } - } - - [Fact] - public void SplitIntoChunks_ContentIntegrity_NoDuplicates() - { - // Arrange - paragraphs 
with unique IDs - var paragraphs = Enumerable.Range(1, 50).Select(i => $"ID={i:D5}|{new string('y', 8_000)}").ToList(); - var body = string.Join("\n\n", paragraphs); - - // Act - var chunks = SplitIntoChunks(body); - var reassembled = string.Join("\n\n", chunks); - - // Assert - each ID appears exactly once - foreach (var i in Enumerable.Range(1, 50)) - { - var id = $"ID={i:D5}|"; - var count = CountOccurrences(reassembled, id); - count.Should().Be(1, $"ID {i} should appear exactly once"); - } - } - - [Fact] - public void SplitIntoChunks_ContentIntegrity_PreservesOrder() - { - // Arrange - numbered paragraphs - var paragraphs = Enumerable.Range(1, 60).Select(i => $"SEQ{i:D4}").ToList(); - var body = string.Join("\n\n", paragraphs); - - // Act - var chunks = SplitIntoChunks(body); - var reassembled = string.Join("\n\n", chunks); - - // Assert - sequence numbers appear in order - var lastIndex = -1; - foreach (var i in Enumerable.Range(1, 60)) - { - var marker = $"SEQ{i:D4}"; - var index = reassembled.IndexOf(marker, StringComparison.Ordinal); - index.Should().BeGreaterThan(lastIndex, $"SEQ{i} should come after SEQ{i - 1}"); - lastIndex = index; - } - } - - [Fact] - public void SplitIntoChunks_ContentIntegrity_ExactMatch() - { - // Arrange - small enough to fit in one chunk - var paragraphs = Enumerable.Range(1, 20).Select(i => $"Para {i}").ToList(); - var body = string.Join("\n\n", paragraphs); - - // Act - var chunks = SplitIntoChunks(body); - var reassembled = string.Join("\n\n", chunks); - - // Assert - exact match - reassembled.Should().Be(body); - } - - private static int CountOccurrences(string text, string pattern) - { - var count = 0; - var index = 0; - while ((index = text.IndexOf(pattern, index, StringComparison.Ordinal)) != -1) - { - count++; - index += pattern.Length; - } - return count; - } -}