From c379d9bee657c8495e7ec5a4285e6ae5f716620c Mon Sep 17 00:00:00 2001 From: mbeaulne Date: Thu, 18 Jun 2026 13:58:41 -0400 Subject: [PATCH] Improve component search relevance and AI reranking --- src/services/componentSearchIndex.test.ts | 127 ++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/src/services/componentSearchIndex.test.ts b/src/services/componentSearchIndex.test.ts index 219dbe169..5cf7ce12b 100644 --- a/src/services/componentSearchIndex.test.ts +++ b/src/services/componentSearchIndex.test.ts @@ -593,6 +593,133 @@ describe("lexicalSearch", () => { expect(results.map((result) => result.digest)).toEqual(["target"]); }); + describe("search quality expectations", () => { + const qualityIndex = buildSearchIndex([ + makeSourced({ + digest: "train-test-split", + spec: { + name: "train_test_split", + description: "Split a dataset into train and test partitions.", + inputs: [{ name: "dataset", type: "Dataset" }], + outputs: [{ name: "train" }, { name: "test" }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "train-model", + spec: { + name: "train_model", + description: "Fit a classifier on tabular data.", + inputs: [{ name: "table", type: "Dataset" }], + outputs: [{ name: "model", type: { artifact: "Model" } }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "filter-rows", + spec: { + name: "filter_rows", + description: "Filter dataset rows with a boolean condition.", + inputs: [{ name: "dataset" }], + outputs: [{ name: "filtered_dataset" }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "load-csv", + spec: { + name: "load_csv_file", + description: "Read a CSV file into a tabular dataframe.", + inputs: [{ name: "path", type: "String" }], + outputs: [{ name: "table", type: "Dataset" }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "local-upload", + spec: { + name: "upload_file", + description: "Upload a file to a local directory.", + inputs: [{ name: "file" }], + outputs: [{ name: "path" }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "gcs-upload", + spec: { + name: "upload_to_gcs", + description: "Upload a file to GCS cloud storage.", + inputs: [{ name: "file" }], + outputs: [{ name: "gcs_uri" }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "predict-labels", + spec: { + name: "predict_labels", + description: "Infer labels from examples using a trained model.", + inputs: [{ name: "model" }, { name: "examples" }], + outputs: [{ name: "predictions" }], + implementation: { container: { image: "x" } }, + }, + }), + makeSourced({ + digest: "text-embeddings", + spec: { + name: "create_text_embeddings", + description: "Create vector embeddings for text documents.", + inputs: [{ name: "documents" }], + outputs: [{ name: "embeddings", type: "EmbeddingVector" }], + implementation: { container: { image: "x" } }, + }, + }), + ]); + + it.each([ + { + query: "split dataset into train and test", + expectedDigests: ["train-test-split"], + }, + { + query: "fit model on tabular data", + expectedDigests: ["train-model"], + }, + { + query: "read csv file", + expectedDigests: ["load-csv"], + }, + { + query: "filtr dataset rows", + expectedDigests: ["filter-rows"], + }, + { + query: "infer labels from model", + expectedDigests: ["predict-labels"], + }, + { + query: "make vector embeddings for text", + expectedDigests: ["text-embeddings"], + }, + { + query: "upload a file but not to GCS", + expectedDigests: ["local-upload"], + }, + ])( + "returns expected results for '$query'", + ({ query, expectedDigests }) => { + const results = lexicalSearch(qualityIndex, query).map( + (result) => result.digest, + ); + + expect(results.slice(0, expectedDigests.length)).toEqual( + expectedDigests, + ); + }, + ); + }); + it("does not special-case single-letter non-stop-word tokens", () => { const index = buildSearchIndex([ makeSourced({