aryn-ai · AbhijitP-009 · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025
diff --git a/Earnings_Call_Ingestion_Script.ipynb b/Earnings_Call_Ingestion_Script.ipynb
@@ -18,7 +18,14 @@
     "from sycamore.data.document import Document\n",
     "from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer\n",
     "from sycamore.llms import OpenAI, OpenAIModels\n",
+<<<<<<< HEAD
+    "from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
+    "from aryn_sdk.client.client import Client\n",
+    "from sycamore.transforms.embed import OpenAIEmbedder\n",
+    "from sycamore import MaterializeSourceMode"
+=======
     "from sycamore.transforms.embed import SentenceTransformerEmbedder"
+>>>>>>> main
    ]
   },
   {
@@ -28,6 +35,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
+    "## Set your api-keys. You'll need an ARYN_API_KEY and an OPENAI_API_KEY\n",
+    "context = sycamore.init()\n",
+    "paths = \"<PUT_YOUR_PATH_HERE>\"\n",
+    "initial_docset = context.read.binary(paths, binary_format=\"pdf\")"
+=======
     "context = sycamore.init()\n",
     "# local file path to the SortBenchmark dataset\n",
     "paths = \"<PUT_YOUR_PATH_HERE>\"\n",
@@ -43,6 +56,7 @@
    "outputs": [],
    "source": [
     "## set your keys here"
+>>>>>>> main
    ]
   },
   {
@@ -109,9 +123,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
+    "partitioned_docset = (initial_docset.partition(partitioner=ArynPartitioner())\n",
+    "                      .materialize(path=\"PATH_TO_STORE_MATERIALIZED\", source_mode=MaterializeSourceMode.USE_STORED)\n",
+=======
     "from sycamore import MaterializeSourceMode\n",
     "partitioned_docset = (initial_docset.partition(partitioner=ArynPartitioner())\n",
     "                      .materialize(path=\"/Users/abhijitpujare/workspace/haystack-workshop-2025/materialize/partitioned_docset\", source_mode=MaterializeSourceMode.USE_STORED)\n",
+>>>>>>> main
     "                      .split_elements(tokenizer=tokenizer, max_tokens=512)\n",
     "                      .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, schema=schema_json, schema_name=\"earnings_calls\")))"
    ]
@@ -175,7 +194,10 @@
     "\n",
     "  return doc\n",
     "\n",
+<<<<<<< HEAD
+=======
     "#filtered_Docset.map_elements(markSpeakers).show()\n",
+>>>>>>> main
     "speakersMarkedDocSet = docset_no_orig_elements.map_elements(markSpeakers)"
    ]
   },
@@ -219,8 +241,12 @@
     "def filterOnlySpeakers(elem: Element):\n",
     "    return 'speaker' in elem.properties\n",
     "\n",
+<<<<<<< HEAD
+    "finalDocSet = mergedDialogeSet.filter_elements(filterOnlySpeakers)"
+=======
     "finalDocSet = mergedDialogeSet.filter_elements(filterOnlySpeakers)\n",
     "#finalDocSet.show()"
+>>>>>>> main
    ]
   },
   {
@@ -230,11 +256,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
+    "model_name = \"text-embedding-3-small\"\n",
+    "docset_name = \"<PUT_YOUR_DOCSET_NAME_HERE>\"\n",
+    "myClient = Client(aryn_url=\"https://test-api.aryn.ai\", aryn_api_key=\"\")\n",
+    "docset = myClient.create_docset(name=docset_name)"
+=======
     "from sycamore.transforms.embed import OpenAIEmbedder\n",
     "model_name = \"text-embedding-3-small\"\n",
     "from aryn_sdk.client.client import Client \n",
     "myClient = Client(aryn_url=\"https://test-api.aryn.ai\", aryn_api_key=\"\")\n",
     "docset = myClient.create_docset(name=\"haystack_workshop_target_correct\")"
+>>>>>>> main
    ]
   },
   {

diff --git a/QuestionAnsweringNotebook.ipynb b/QuestionAnsweringNotebook.ipynb
@@ -18,7 +18,14 @@
     "from sycamore.data.document import Document\n",
     "from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer\n",
     "from sycamore.llms import OpenAI, OpenAIModels\n",
+<<<<<<< HEAD
+    "from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
+    "from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt\n",
+    "from sycamore.llms.prompts.prompts import JinjaPrompt\n",
+    "from sycamore.transforms.extract_entity import OpenAIEntityExtractor"
+=======
     "from sycamore.transforms.embed import SentenceTransformerEmbedder"
+>>>>>>> main
    ]
   },
   {
@@ -29,6 +36,9 @@
    "outputs": [],
    "source": [
     "context = sycamore.init()\n",
+<<<<<<< HEAD
+    "initial_docset = context.read.aryn(aryn_url=\"https://test-api.aryn.ai/v1/storage\", docset_id=\"<YOUR-DOCSET-ID-HERE>\", aryn_api_key=\"<YOUR-ARYN-API-KEY-HERE>\")"
+=======
     "initial_docset = context.read.aryn(aryn_url=\"https://test-api.aryn.ai/v1/storage\", docset_id=\"aryn:ds-v9tfacka0xifljqaj0l1rbh\", aryn_api_key=\"\")"
    ]
   },
@@ -58,6 +68,7 @@
    "outputs": [],
    "source": [
     "print(first_doc[0].properties['earnings_calls'])"
+>>>>>>> main
    ]
   },
   {
@@ -87,7 +98,11 @@
    "outputs": [],
    "source": [
     "from sycamore.llms import OpenAI, OpenAIModels\n",
+<<<<<<< HEAD
+    "oai = OpenAI(OpenAIModels.GPT_4O)"
+=======
     "oai = OpenAI(OpenAIModels.GPT_4O, api_key=\"<YOUR-KEY-HERE>\")"
+>>>>>>> main
    ]
   },
   {
@@ -97,6 +112,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
+    "## Cell to answer question: tell me the number of customers MongoDB had at the end of Q1\n",
+    "entity_extractor = OpenAIEntityExtractor(entity_name=\"num_customers\", llm=oai, num_of_elements=10,\n",
+    "                      field = \"text_representation\", use_elements=True)\n",
+    "                      \n",
+    "\n",
+    "mdb_docset = removed_orig_docset.filter( lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'MDB' and doc.properties['earnings_calls']['quarter']=='Q1').extract_entity(entity_extractor)"
+=======
     "from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt\n",
     "from sycamore.llms.prompts.prompts import JinjaPrompt\n",
     "from sycamore.transforms.extract_entity import OpenAIEntityExtractor\n",
@@ -129,6 +152,7 @@
     "                          '''\n",
     "#logical_node = LlmFilter(node_id=0, question=\"Filter all the records where the Brian Chesky spoke\", field=\"Brian Chesky\")\n",
     "#sycamore_operator = SycamoreLlmFilter(context, logical_node, query_id=\"test\", inputs=[exploded_docset])"
+>>>>>>> main
    ]
   },
   {
@@ -144,6 +168,127 @@
   {
    "cell_type": "code",
    "execution_count": null,
+<<<<<<< HEAD
+   "id": "fddbb1c1-4fe4-4aa1-a959-4c9e1e624e7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "avgo_docset = removed_orig_docset.filter( lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO').llm_filter(new_field=\"_autogen_LLMFilterOutput\",\n",
+    "            prompt=LlmFilterMessagesJinjaPrompt.set(filter_question=\"Does this mention the VMWare acquisition?\", use_elements=True),\n",
+    "            field = \"text_representation\",\n",
+    "            llm=oai,\n",
+    "            keep_none=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "887bdb16-1a49-4eb8-bb6d-e663aad23d43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Cell to answer the question: tell me the first earnings call where the VMware acquisition was mentioned\n",
+    "\n",
+    "from sycamore.transforms import DateTimeStandardizer\n",
+    "\n",
+    "def filterVMware(elem: Element) -> bool:\n",
+    "  return \"VMware\" in elem.text_representation\n",
+    "\n",
+    "vwmare_docset_sorted = (removed_orig_docset\n",
+    "                        .filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')\n",
+    "                        .filter_elements(filterVMware)\n",
+    "                        .map(lambda doc: DateTimeStandardizer.standardize(doc, key_path = [\"properties\",\"earnings_calls\",\"date\"]))\n",
+    "                        .sort(descending=False, field=\"properties.earnings_calls.dateTime\"))\n",
+    "vwmare_docset_sorted.take(1)[0].properties['earnings_calls']['day']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d3de501-27cf-452a-8366-8d0ec60c307d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Cell to answer the question: summarize the impact of the VMware acquisition on Broadcom's earnings\n",
+    "from sycamore.llms.prompts.prompts import JinjaPrompt\n",
+    "from sycamore.llms.llms import LLMMode\n",
+    "myprompt = JinjaPrompt(\n",
+    "    system=\"You are a robot\",\n",
+    "    user=\"\"\"Here's an earnings call. Please answer the question {{ question }}\n",
+    "            {% for elt in doc.elements %}\n",
+    "            {{ elt.text_representation }}\n",
+    "            {% endfor %}\"\"\",\n",
+    "    question=\"Summarize the impact of the vmware acquisition on broadcom's earnings\"\n",
+    ")\n",
+    "\n",
+    "vmware_acquistion_summary = (removed_orig_docset.filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')\n",
+    "                     .filter_elements(filterVMware)\n",
+    "                     .llm_map(prompt=myprompt, output_field=\"acquisition_impact_summary\", llm=oai, llm_mode=LLMMode.ASYNC))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb56b73a-60f0-4d2b-9cb8-3aed4c0c72d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ls = []\n",
+    "for doc in vmware_acquistion_summary.take_all():\n",
+    "    ls.append((doc.properties['earnings_calls']['quarter'], doc.properties['acquisition_impact_summary']))\n",
+    "print(ls)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24fe8623-a63c-45ad-a973-8575fc23d57b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Cell to answer the question: tell me how Intuit is integrating Intuit Assist (their new AI offering) into existing products\n",
+    "intuit_prompt = JinjaPrompt(\n",
+    "    system=\"You are a robot\",\n",
+    "    user=\"\"\"Here's an earnings call. Please answer the question {{ question }}\n",
+    "            {% for elt in doc.elements %}\n",
+    "            {{ elt.text_representation }}\n",
+    "            {% endfor %}\"\"\",\n",
+    "    question=\"Summarize how Intuit Assist is being integrated into Intuit's existing products\"\n",
+    ")\n",
+    "\n",
+    "def filterForAssist(elem: Element) -> bool:\n",
+    "  return \"Assist\" in elem.text_representation\n",
+    "\n",
+    "intuit_assist_summary = (removed_orig_docset.filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'INTU')\n",
+    "                     .filter_elements(filterForAssist)\n",
+    "                     .llm_map(prompt=intuit_prompt, output_field=\"intuit_assist_summary\", llm=oai, llm_mode=LLMMode.ASYNC))\n",
+    "\n",
+    "ls = []\n",
+    "for doc in intuit_assist_summary.take_all():\n",
+    "    ls.append((doc.properties['earnings_calls']['quarter'], doc.properties['intuit_assist_summary']))\n",
+    "print(ls)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7c7d15d-dcfa-4bd6-a210-824aca1af220",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Cell to answer the question 'Return all the companies that mentioned inflation and return a count of the number of times inflation was mentioned'\n",
+    "inflation_mentioned = (exploded_docset2.filter(lambda element: 'inflation' in element.text_representation.lower())\n",
+    "                     .groupby_count(field='properties.earnings_calls.company_name'))\n",
+    "inflation_mentioned.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d66caeeb-cbe3-4f92-862f-33923012e1c6",
+   "metadata": {},
+   "outputs": [],
+=======
    "id": "192dc7b1-1598-479f-ab99-709d0c5c81d0",
    "metadata": {},
    "outputs": [],
@@ -167,6 +312,7 @@
    "id": "fddbb1c1-4fe4-4aa1-a959-4c9e1e624e7b",
    "metadata": {},
    "outputs": [],
+>>>>>>> main
    "source": []
   }
  ],

diff --git a/sycamore b/sycamore
+39 −0		.github/workflows/aryn-sdk_release.yml
+0 −12		docs/source/sycamore/APIs/low_level_transforms/summarize.rst
+253 −208		lib/sycamore/poetry.lock
+3 −3		lib/sycamore/pyproject.toml
+4 −11		lib/sycamore/sycamore/connectors/aryn/ArynWriter.py
+0 −4		lib/sycamore/sycamore/data/document.py
+4 −2		lib/sycamore/sycamore/docset.py
+2 −13		lib/sycamore/sycamore/functions/tokenizer.py
+0 −12		lib/sycamore/sycamore/llms/prompts/default_prompts.py
+4 −9		lib/sycamore/sycamore/llms/prompts/prompts.py
+125 −80		lib/sycamore/sycamore/query/execution/operations.py
+8 −7		lib/sycamore/sycamore/query/execution/sycamore_operator.py
+42 −6		lib/sycamore/sycamore/tests/integration/query/execution/test_operations.py
+0 −25		lib/sycamore/sycamore/tests/integration/transforms/test_partition.py
+3 −3		lib/sycamore/sycamore/tests/unit/llms/prompts/test_prompts.py
+2 −15		lib/sycamore/sycamore/tests/unit/query/execution/test_sycamore_executor.py
+4 −3		lib/sycamore/sycamore/tests/unit/query/execution/test_sycamore_operator.py
+75 −76		lib/sycamore/sycamore/tests/unit/query/test_operations.py
+3 −2		lib/sycamore/sycamore/tests/unit/test_docset.py
+6 −8		lib/sycamore/sycamore/tests/unit/transforms/test_llm_filter.py
+2 −344		lib/sycamore/sycamore/tests/unit/transforms/test_summarize.py
+0 −36		lib/sycamore/sycamore/tests/unit/utils/test_extract_json.py
+4 −22		lib/sycamore/sycamore/transforms/base.py
+2 −15		lib/sycamore/sycamore/transforms/detr_partitioner.py
+3 −3		lib/sycamore/sycamore/transforms/extract_entity.py
+7 −7		lib/sycamore/sycamore/transforms/extract_schema.py
+2 −2		lib/sycamore/sycamore/transforms/llm_filter.py
+0 −3		lib/sycamore/sycamore/transforms/partition.py
+128 −539		lib/sycamore/sycamore/transforms/summarize.py
+2 −2		lib/sycamore/sycamore/transforms/summarize_images.py
+1 −2		lib/sycamore/sycamore/writer.py
+115 −70		poetry.lock