#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "ipetrash"


import json

from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Self
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

# `session` is the shared, pre-configured requests session from the common module
from common import DIR_DUMPS, session


# Absolute path of this script; its file name prefixes the dump files
FILE_NAME: Path = Path(__file__).resolve()

@dataclass
class Model:
    title: str
    url: str
    size: str
    context: str
    input: str
    id_hash: str
    last_modified: str

    @classmethod
    def parse_from(cls, item: Tag, base_url: str) -> Self:
        # A tag row has two direct child <div>s: the first holds the
        # name/size/context/input cells, the second the "<hash> · <date>" text
        first_div, second_div = item.find_all("div", recursive=False)

        cell_name, cell_size, cell_context, cell_input = first_div.find_all(
            recursive=False
        )

        # maxsplit=1 tolerates a stray "·" inside the date part
        id_hash, last_modified = second_div.get_text(strip=True).split("·", maxsplit=1)
        id_hash = id_hash.strip()
        last_modified = last_modified.strip()

        return cls(
            title=cell_name.a.get_text(strip=True),
            url=urljoin(base_url, cell_name.a["href"]),
            size=cell_size.get_text(strip=True),
            context=cell_context.get_text(strip=True),
            input=cell_input.get_text(strip=True),
            id_hash=id_hash,
            last_modified=last_modified,
        )
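
# A sketch of the markup parse_from() expects. The structure is inferred from
# the selectors and unpacking in this file, not copied from ollama.com, and
# all values are illustrative placeholders:
#
#   <div class="group">
#     <div>                                       <!-- `item` -->
#       <div>
#         <div><a href="/library/qwen3:8b">qwen3:8b</a></div>  <!-- cell_name -->
#         <div>5.2GB</div>                                     <!-- cell_size -->
#         <div>40K</div>                                       <!-- cell_context -->
#         <div>Text</div>                                      <!-- cell_input -->
#       </div>
#       <div>500a1f067a9f · 5 months ago</div>    <!-- id_hash · last_modified -->
#     </div>
#   </div>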


def process(name: str, file_name: Path) -> None:
    url: str = f"https://ollama.com/library/{name}/tags"
    print(f"Load: {url}")

    rs = session.get(url)
    rs.raise_for_status()

    soup = BeautifulSoup(rs.content, "html.parser")

    items: list[Model] = []
    # Every tag row is a direct child <div> (one containing a link)
    # of the "div.group" list container
    for item in soup.select("div.group > div:has(a)"):
        model = Model.parse_from(item, base_url=rs.url)
        print(model)
        items.append(model)

    print(f"Writing to file: {file_name}")
    with open(file_name, "w", encoding="UTF-8") as f:
        # JSONL: one model per line, sorted by title
        for model in sorted(items, key=lambda x: x.title):
            f.write(json.dumps(asdict(model), ensure_ascii=False) + "\n")
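
# Example of one line in the resulting .jsonl dump (field set matches the
# Model dataclass; the values are illustrative, not from a real response):
#
#   {"title": "qwen3:8b", "url": "https://ollama.com/library/qwen3:8b",
#    "size": "5.2GB", "context": "40K", "input": "Text",
#    "id_hash": "500a1f067a9f", "last_modified": "5 months ago"}
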
if __name__ == "__main__":
    import time

    for name in [
        "qwen3.5",
        "qwen3-vl",
        "qwen3",
        "qwen",
        "bge-m3",
        "granite3.2-vision",
    ]:
        path_dump: Path = DIR_DUMPS / f"{FILE_NAME.name}_{name}.jsonl"
        process(name=name, file_name=path_dump)
        print()

        # Small pause between requests to be polite to the server
        time.sleep(1)