Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions Earnings_Call_Ingestion_Script.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@
"from sycamore.data.document import Document\n",
"from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer\n",
"from sycamore.llms import OpenAI, OpenAIModels\n",
<<<<<<< HEAD
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
"from aryn_sdk.client.client import Client\n",
"from sycamore.transforms.embed import OpenAIEmbedder\n",
"from sycamore import MaterializeSourceMode"
=======
"from sycamore.transforms.embed import SentenceTransformerEmbedder"
>>>>>>> main
]
},
{
Expand All @@ -28,6 +35,12 @@
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"## Set your API keys. You'll need an ARYN_API_KEY and an OPENAI_API_KEY\n",
"context = sycamore.init()\n",
"paths = \"<PUT_YOUR_PATH_HERE>\"\n",
"initial_docset = context.read.binary(paths, binary_format=\"pdf\")"
=======
"context = sycamore.init()\n",
"# local file path to the SortBenchmark dataset\n",
"paths = \"<PUT_YOUR_PATH_HERE>\"\n",
Expand All @@ -43,6 +56,7 @@
"outputs": [],
"source": [
"## set your keys here"
>>>>>>> main
]
},
{
Expand Down Expand Up @@ -109,9 +123,14 @@
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"partitioned_docset = (initial_docset.partition(partitioner=ArynPartitioner())\n",
" .materialize(path=\"PATH_TO_STORE_MATERIALIZED\", source_mode=MaterializeSourceMode.USE_STORED)\n",
=======
"from sycamore import MaterializeSourceMode\n",
"partitioned_docset = (initial_docset.partition(partitioner=ArynPartitioner())\n",
" .materialize(path=\"/Users/abhijitpujare/workspace/haystack-workshop-2025/materialize/partitioned_docset\", source_mode=MaterializeSourceMode.USE_STORED)\n",
>>>>>>> main
" .split_elements(tokenizer=tokenizer, max_tokens=512)\n",
" .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, schema=schema_json, schema_name=\"earnings_calls\")))"
]
Expand Down Expand Up @@ -175,7 +194,10 @@
"\n",
" return doc\n",
"\n",
<<<<<<< HEAD
=======
"#filtered_Docset.map_elements(markSpeakers).show()\n",
>>>>>>> main
"speakersMarkedDocSet = docset_no_orig_elements.map_elements(markSpeakers)"
]
},
Expand Down Expand Up @@ -219,8 +241,12 @@
"def filterOnlySpeakers(elem: Element):\n",
" return 'speaker' in elem.properties\n",
"\n",
<<<<<<< HEAD
"finalDocSet = mergedDialogeSet.filter_elements(filterOnlySpeakers)"
=======
"finalDocSet = mergedDialogeSet.filter_elements(filterOnlySpeakers)\n",
"#finalDocSet.show()"
>>>>>>> main
]
},
{
Expand All @@ -230,11 +256,18 @@
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"model_name = \"text-embedding-3-small\"\n",
"docset_name = \"<PUT_YOUR_DOCSET_NAME_HERE>\"\n",
"myClient = Client(aryn_url=\"https://test-api.aryn.ai\", aryn_api_key=\"\")\n",
"docset = myClient.create_docset(name=docset_name)"
=======
"from sycamore.transforms.embed import OpenAIEmbedder\n",
"model_name = \"text-embedding-3-small\"\n",
"from aryn_sdk.client.client import Client \n",
"myClient = Client(aryn_url=\"https://test-api.aryn.ai\", aryn_api_key=\"\")\n",
"docset = myClient.create_docset(name=\"haystack_workshop_target_correct\")"
>>>>>>> main
]
},
{
Expand Down
146 changes: 146 additions & 0 deletions QuestionAnsweringNotebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@
"from sycamore.data.document import Document\n",
"from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer\n",
"from sycamore.llms import OpenAI, OpenAIModels\n",
<<<<<<< HEAD
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
"from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt\n",
"from sycamore.llms.prompts.prompts import JinjaPrompt\n",
"from sycamore.transforms.extract_entity import OpenAIEntityExtractor"
=======
"from sycamore.transforms.embed import SentenceTransformerEmbedder"
>>>>>>> main
]
},
{
Expand All @@ -29,6 +36,9 @@
"outputs": [],
"source": [
"context = sycamore.init()\n",
<<<<<<< HEAD
"initial_docset = context.read.aryn(aryn_url=\"https://test-api.aryn.ai/v1/storage\", docset_id=\"<YOUR-DOCSET-ID-HERE>\", aryn_api_key=\"<YOUR-ARYN-API-KEY-HERE>\")"
=======
"initial_docset = context.read.aryn(aryn_url=\"https://test-api.aryn.ai/v1/storage\", docset_id=\"aryn:ds-v9tfacka0xifljqaj0l1rbh\", aryn_api_key=\"\")"
]
},
Expand Down Expand Up @@ -58,6 +68,7 @@
"outputs": [],
"source": [
"print(first_doc[0].properties['earnings_calls'])"
>>>>>>> main
]
},
{
Expand Down Expand Up @@ -87,7 +98,11 @@
"outputs": [],
"source": [
"from sycamore.llms import OpenAI, OpenAIModels\n",
<<<<<<< HEAD
"oai = OpenAI(OpenAIModels.GPT_4O)"
=======
"oai = OpenAI(OpenAIModels.GPT_4O, api_key=\"<YOUR-KEY-HERE>\")"
>>>>>>> main
]
},
{
Expand All @@ -97,6 +112,14 @@
"metadata": {},
"outputs": [],
"source": [
<<<<<<< HEAD
"## Cell to answer question: tell me the number of customers MongoDB had at the end of Q1\n",
"entity_extractor = OpenAIEntityExtractor(entity_name=\"num_customers\", llm=oai, num_of_elements=10,\n",
" field = \"text_representation\", use_elements=True)\n",
" \n",
"\n",
"mdb_docset = removed_orig_docset.filter( lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'MDB' and doc.properties['earnings_calls']['quarter']=='Q1').extract_entity(entity_extractor)"
=======
"from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt\n",
"from sycamore.llms.prompts.prompts import JinjaPrompt\n",
"from sycamore.transforms.extract_entity import OpenAIEntityExtractor\n",
Expand Down Expand Up @@ -129,6 +152,7 @@
" '''\n",
"#logical_node = LlmFilter(node_id=0, question=\"Filter all the records where the Brian Chesky spoke\", field=\"Brian Chesky\")\n",
"#sycamore_operator = SycamoreLlmFilter(context, logical_node, query_id=\"test\", inputs=[exploded_docset])"
>>>>>>> main
]
},
{
Expand All @@ -144,6 +168,127 @@
{
"cell_type": "code",
"execution_count": null,
<<<<<<< HEAD
"id": "fddbb1c1-4fe4-4aa1-a959-4c9e1e624e7b",
"metadata": {},
"outputs": [],
"source": [
"avgo_docset = removed_orig_docset.filter( lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO').llm_filter(new_field=\"_autogen_LLMFilterOutput\",\n",
" prompt=LlmFilterMessagesJinjaPrompt.set(filter_question=\"Does this mention the VMWare acquisition?\", use_elements=True),\n",
" field = \"text_representation\",\n",
" llm=oai,\n",
" keep_none=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "887bdb16-1a49-4eb8-bb6d-e663aad23d43",
"metadata": {},
"outputs": [],
"source": [
"## Cell to answer the question: tell me the first earnings call where the VMware acquisition was mentioned\n",
"\n",
"from sycamore.transforms import DateTimeStandardizer\n",
"\n",
"def filterVMware(elem: Element) -> bool:\n",
" return \"VMware\" in elem.text_representation\n",
"\n",
"vwmare_docset_sorted = (removed_orig_docset\n",
" .filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')\n",
" .filter_elements(filterVMware)\n",
" .map(lambda doc: DateTimeStandardizer.standardize(doc, key_path = [\"properties\",\"earnings_calls\",\"date\"]))\n",
" .sort(descending=False, field=\"properties.earnings_calls.dateTime\"))\n",
"vwmare_docset_sorted.take(1)[0].properties['earnings_calls']['day']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d3de501-27cf-452a-8366-8d0ec60c307d",
"metadata": {},
"outputs": [],
"source": [
"## Cell to answer the question: summarize the impact of the VMware acquisition on Broadcom's earnings\n",
"from sycamore.llms.prompts.prompts import JinjaPrompt\n",
"from sycamore.llms.llms import LLMMode\n",
"myprompt = JinjaPrompt(\n",
" system=\"You are a robot\",\n",
" user=\"\"\"Here's an earnings call. Please answer the question {{ question }}\n",
" {% for elt in doc.elements %}\n",
" {{ elt.text_representation }}\n",
" {% endfor %}\"\"\",\n",
" question=\"Summarize the impact of the vmware acquisition on broadcom's earnings\"\n",
")\n",
"\n",
"vmware_acquistion_summary = (removed_orig_docset.filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')\n",
" .filter_elements(filterVMware)\n",
" .llm_map(prompt=myprompt, output_field=\"acquisition_impact_summary\", llm=oai, llm_mode=LLMMode.ASYNC))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb56b73a-60f0-4d2b-9cb8-3aed4c0c72d8",
"metadata": {},
"outputs": [],
"source": [
"ls = []\n",
"for doc in vmware_acquistion_summary.take_all():\n",
" ls.append((doc.properties['earnings_calls']['quarter'], doc.properties['acquisition_impact_summary']))\n",
"print(ls)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24fe8623-a63c-45ad-a973-8575fc23d57b",
"metadata": {},
"outputs": [],
"source": [
"## Cell to answer the question: tell me how Intuit is integrating Intuit Assist (their new AI offering) into existing products\n",
"intuit_prompt = JinjaPrompt(\n",
" system=\"You are a robot\",\n",
" user=\"\"\"Here's an earnings call. Please answer the question {{ question }}\n",
" {% for elt in doc.elements %}\n",
" {{ elt.text_representation }}\n",
" {% endfor %}\"\"\",\n",
" question=\"Summarize how Intuit Assist is being integrated into Intuit's existing products\"\n",
")\n",
"\n",
"def filterForAssist(elem: Element) -> bool:\n",
" return \"Assist\" in elem.text_representation\n",
"\n",
"intuit_assist_summary = (removed_orig_docset.filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'INTU')\n",
" .filter_elements(filterForAssist)\n",
" .llm_map(prompt=intuit_prompt, output_field=\"intuit_assist_summary\", llm=oai, llm_mode=LLMMode.ASYNC))\n",
"\n",
"ls = []\n",
"for doc in intuit_assist_summary.take_all():\n",
" ls.append((doc.properties['earnings_calls']['quarter'], doc.properties['intuit_assist_summary']))\n",
"print(ls)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7c7d15d-dcfa-4bd6-a210-824aca1af220",
"metadata": {},
"outputs": [],
"source": [
"## Cell to answer the question 'Return all the companies that mentioned inflation and return a count of the number of times inflation was mentioned'\n",
"inflation_mentioned = (exploded_docset2.filter(lambda element: 'inflation' in element.text_representation.lower())\n",
" .groupby_count(field='properties.earnings_calls.company_name'))\n",
"inflation_mentioned.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d66caeeb-cbe3-4f92-862f-33923012e1c6",
"metadata": {},
"outputs": [],
=======
"id": "192dc7b1-1598-479f-ab99-709d0c5c81d0",
"metadata": {},
"outputs": [],
Expand All @@ -167,6 +312,7 @@
"id": "fddbb1c1-4fe4-4aa1-a959-4c9e1e624e7b",
"metadata": {},
"outputs": [],
>>>>>>> main
"source": []
}
],
Expand Down
2 changes: 1 addition & 1 deletion sycamore
Submodule sycamore updated 32 files
+39 −0 .github/workflows/aryn-sdk_release.yml
+0 −12 docs/source/sycamore/APIs/low_level_transforms/summarize.rst
+253 −208 lib/sycamore/poetry.lock
+3 −3 lib/sycamore/pyproject.toml
+4 −11 lib/sycamore/sycamore/connectors/aryn/ArynWriter.py
+0 −4 lib/sycamore/sycamore/data/document.py
+4 −2 lib/sycamore/sycamore/docset.py
+2 −13 lib/sycamore/sycamore/functions/tokenizer.py
+0 −12 lib/sycamore/sycamore/llms/prompts/default_prompts.py
+4 −9 lib/sycamore/sycamore/llms/prompts/prompts.py
+125 −80 lib/sycamore/sycamore/query/execution/operations.py
+8 −7 lib/sycamore/sycamore/query/execution/sycamore_operator.py
+42 −6 lib/sycamore/sycamore/tests/integration/query/execution/test_operations.py
+0 −25 lib/sycamore/sycamore/tests/integration/transforms/test_partition.py
+3 −3 lib/sycamore/sycamore/tests/unit/llms/prompts/test_prompts.py
+2 −15 lib/sycamore/sycamore/tests/unit/query/execution/test_sycamore_executor.py
+4 −3 lib/sycamore/sycamore/tests/unit/query/execution/test_sycamore_operator.py
+75 −76 lib/sycamore/sycamore/tests/unit/query/test_operations.py
+3 −2 lib/sycamore/sycamore/tests/unit/test_docset.py
+6 −8 lib/sycamore/sycamore/tests/unit/transforms/test_llm_filter.py
+2 −344 lib/sycamore/sycamore/tests/unit/transforms/test_summarize.py
+0 −36 lib/sycamore/sycamore/tests/unit/utils/test_extract_json.py
+4 −22 lib/sycamore/sycamore/transforms/base.py
+2 −15 lib/sycamore/sycamore/transforms/detr_partitioner.py
+3 −3 lib/sycamore/sycamore/transforms/extract_entity.py
+7 −7 lib/sycamore/sycamore/transforms/extract_schema.py
+2 −2 lib/sycamore/sycamore/transforms/llm_filter.py
+0 −3 lib/sycamore/sycamore/transforms/partition.py
+128 −539 lib/sycamore/sycamore/transforms/summarize.py
+2 −2 lib/sycamore/sycamore/transforms/summarize_images.py
+1 −2 lib/sycamore/sycamore/writer.py
+115 −70 poetry.lock
Loading