Skip to content

Commit 4b68193

Browse files
committed
Added. html_parsing/ollama_com/dump_model_tags.py
1 parent 414eb2e commit 4b68193

7 files changed

Lines changed: 630 additions & 0 deletions
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
__author__ = "ipetrash"
5+
6+
7+
import json
8+
import re
9+
10+
from dataclasses import dataclass, asdict
11+
from pathlib import Path
12+
from typing import Self
13+
from urllib.parse import urljoin
14+
15+
import requests
16+
from bs4 import BeautifulSoup, Tag
17+
18+
from common import DIR_DUMPS, session
19+
20+
21+
# Resolved absolute path of this script; its file name is used as the prefix
# of the dump files written by the __main__ section.
FILE_NAME: Path = Path(__file__).resolve()
22+
23+
24+
@dataclass
class Model:
    """One entry of the tag list on an ollama.com ``/library/<name>/tags`` page.

    Every field keeps the text exactly as it is shown on the page; nothing is
    converted to numbers or dates.
    """

    title: str  # Text of the tag link, e.g. "bge-m3:567m"
    url: str  # Absolute URL of the tag, resolved against the page URL
    size: str  # e.g. "1.2GB"
    context: str  # e.g. "8K"
    input: str  # e.g. "Text" or "Text, Image"
    id_hash: str  # Text before the "·" separator in the second row
    last_modified: str  # Text after the "·" separator, e.g. "1 year ago"

    @classmethod
    def parse_from(cls, item: Tag, base_url: str) -> Self:
        """Build a Model from one tag entry element.

        Args:
            item: Element with exactly two direct ``div`` children. The first
                one must have exactly four direct children (name, size,
                context, input); the second one must contain a text of the
                form ``<hash>·<last modified>``.
            base_url: URL used to turn the link ``href`` into an absolute URL.

        Returns:
            The parsed model.

        Raises:
            ValueError: If the number of children or of "·"-separated parts
                differs from the expected layout (unpacking fails).
        """
        header_row, footer_row = item.find_all("div", recursive=False)

        name_cell, size_cell, context_cell, input_cell = header_row.find_all(
            recursive=False
        )
        link = name_cell.a

        # The second row reads "<hash> · <last modified>".
        hash_part, modified_part = footer_row.get_text(strip=True).split("·")

        return cls(
            title=link.get_text(strip=True),
            url=urljoin(base_url, link["href"]),
            size=size_cell.get_text(strip=True),
            context=context_cell.get_text(strip=True),
            input=input_cell.get_text(strip=True),
            id_hash=hash_part.strip(),
            last_modified=modified_part.strip(),
        )
55+
56+
57+
# NOTE(review): this rebinds the name `session` that is also imported from
# `common` at the top of the file, so every later `session.get(...)` in this
# module goes through this fresh requests.Session instead of the imported
# one — confirm that this is intended.
session = requests.Session()
58+
59+
60+
def process(name: str, file_name: Path) -> None:
    """Download the tags page of one ollama.com library model and dump it.

    The page ``https://ollama.com/library/<name>/tags`` is fetched, every tag
    entry is parsed into a ``Model`` and the result is written to
    ``file_name`` as JSON Lines (one object per line), sorted by title.

    Args:
        name: Library model name, e.g. ``"qwen3"``.
        file_name: Path of the ``.jsonl`` file to (over)write.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url: str = f"https://ollama.com/library/{name}/tags"
    print(f"Load: {url}")

    rs = session.get(url)
    rs.raise_for_status()

    soup = BeautifulSoup(rs.content, "html.parser")

    items: list[Model] = []
    for item in soup.select("div.group > div:has(a)"):
        # rs.url is the final URL after redirects, so relative links resolve
        # against the page that was actually served.
        model = Model.parse_from(item, base_url=rs.url)
        print(model)
        items.append(model)

    # Report the dump path that is actually written, not the script's own path.
    print(f"Writing to file: {file_name}")
    with open(file_name, "w", encoding="UTF-8") as f:
        for model in sorted(items, key=lambda x: x.title):
            f.write(json.dumps(asdict(model), ensure_ascii=False) + "\n")
79+
80+
81+
if __name__ == "__main__":
    import time

    # Library models whose tag lists are dumped, one .jsonl file per model.
    for name in [
        "qwen3.5",
        "qwen3-vl",
        "qwen3",
        "qwen",
        "bge-m3",
        "granite3.2-vision",
    ]:
        # Dump file name: "<script file name>_<model name>.jsonl".
        dump_name = f"{FILE_NAME.name}_{name}.jsonl"
        path_dump: Path = DIR_DUMPS / dump_name

        process(name=name, file_name=path_dump)
        print()

        # Short pause after each page request.
        time.sleep(1)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"title": "bge-m3:567m", "url": "https://ollama.com/library/bge-m3:567m", "size": "1.2GB", "context": "8K", "input": "Text", "id_hash": "790764642607", "last_modified": "1 year ago"}
2+
{"title": "bge-m3:567m-fp16", "url": "https://ollama.com/library/bge-m3:567m-fp16", "size": "1.2GB", "context": "8K", "input": "Text", "id_hash": "790764642607", "last_modified": "1 year ago"}
3+
{"title": "bge-m3:latest", "url": "https://ollama.com/library/bge-m3:latest", "size": "1.2GB", "context": "8K", "input": "Text", "id_hash": "790764642607", "last_modified": "1 year ago"}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"title": "granite3.2-vision:2b", "url": "https://ollama.com/library/granite3.2-vision:2b", "size": "2.4GB", "context": "16K", "input": "Text, Image", "id_hash": "3be41a661804", "last_modified": "1 year ago"}
2+
{"title": "granite3.2-vision:2b-fp16", "url": "https://ollama.com/library/granite3.2-vision:2b-fp16", "size": "6.0GB", "context": "16K", "input": "Text, Image", "id_hash": "17ca6aa97bd9", "last_modified": "1 year ago"}
3+
{"title": "granite3.2-vision:2b-q4_K_M", "url": "https://ollama.com/library/granite3.2-vision:2b-q4_K_M", "size": "2.4GB", "context": "16K", "input": "Text, Image", "id_hash": "3be41a661804", "last_modified": "1 year ago"}
4+
{"title": "granite3.2-vision:2b-q8_0", "url": "https://ollama.com/library/granite3.2-vision:2b-q8_0", "size": "3.6GB", "context": "16K", "input": "Text, Image", "id_hash": "9b6204ce60f6", "last_modified": "1 year ago"}
5+
{"title": "granite3.2-vision:latest", "url": "https://ollama.com/library/granite3.2-vision:latest", "size": "2.4GB", "context": "16K", "input": "Text, Image", "id_hash": "3be41a661804", "last_modified": "1 year ago"}

0 commit comments

Comments
 (0)