#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "ipetrash"


import json

from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Self
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

# `session` is the shared, pre-configured requests session from the common module
from common import DIR_DUMPS, session


# Absolute path of this script; its file name prefixes the dump files
FILE_NAME: Path = Path(__file__).resolve()

@dataclass
class Model:
    title: str
    url: str
    size: str
    context: str
    input: str
    id_hash: str
    last_modified: str

    @classmethod
    def parse_from(cls, item: Tag, base_url: str) -> Self:
        # A tag row has two direct child <div>s: the first holds the
        # name/size/context/input cells, the second the "<hash> · <date>" text
        first_div, second_div = item.find_all("div", recursive=False)

        cell_name, cell_size, cell_context, cell_input = first_div.find_all(
            recursive=False
        )

        # maxsplit=1 tolerates a stray "·" inside the date part
        id_hash, last_modified = second_div.get_text(strip=True).split("·", maxsplit=1)
        id_hash = id_hash.strip()
        last_modified = last_modified.strip()

        return cls(
            title=cell_name.a.get_text(strip=True),
            url=urljoin(base_url, cell_name.a["href"]),
            size=cell_size.get_text(strip=True),
            context=cell_context.get_text(strip=True),
            input=cell_input.get_text(strip=True),
            id_hash=id_hash,
            last_modified=last_modified,
        )
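
# A sketch of the markup parse_from() expects. The structure is inferred from
# the selectors and unpacking in this file, not copied from ollama.com, and
# all values are illustrative placeholders:
#
#   <div class="group">
#     <div>                                       <!-- `item` -->
#       <div>
#         <div><a href="/library/qwen3:8b">qwen3:8b</a></div>  <!-- cell_name -->
#         <div>5.2GB</div>                                     <!-- cell_size -->
#         <div>40K</div>                                       <!-- cell_context -->
#         <div>Text</div>                                      <!-- cell_input -->
#       </div>
#       <div>500a1f067a9f · 5 months ago</div>    <!-- id_hash · last_modified -->
#     </div>
#   </div>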


def process(name: str, file_name: Path) -> None:
    url: str = f"https://ollama.com/library/{name}/tags"
    print(f"Load: {url}")

    rs = session.get(url)
    rs.raise_for_status()

    soup = BeautifulSoup(rs.content, "html.parser")

    items: list[Model] = []
    # Every tag row is a direct child <div> (one containing a link)
    # of the "div.group" list container
    for item in soup.select("div.group > div:has(a)"):
        model = Model.parse_from(item, base_url=rs.url)
        print(model)
        items.append(model)

    print(f"Writing to file: {file_name}")
    with open(file_name, "w", encoding="UTF-8") as f:
        # JSONL: one model per line, sorted by title
        for model in sorted(items, key=lambda x: x.title):
            f.write(json.dumps(asdict(model), ensure_ascii=False) + "\n")
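
# Example of one line in the resulting .jsonl dump (field set matches the
# Model dataclass; the values are illustrative, not from a real response):
#
#   {"title": "qwen3:8b", "url": "https://ollama.com/library/qwen3:8b",
#    "size": "5.2GB", "context": "40K", "input": "Text",
#    "id_hash": "500a1f067a9f", "last_modified": "5 months ago"}
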
if __name__ == "__main__":
    import time

    for name in [
        "qwen3.5",
        "qwen3-vl",
        "qwen3",
        "qwen",
        "bge-m3",
        "granite3.2-vision",
    ]:
        path_dump: Path = DIR_DUMPS / f"{FILE_NAME.name}_{name}.jsonl"
        process(name=name, file_name=path_dump)
        print()

        # Small pause between requests to be polite to the server
        time.sleep(1)