-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetadata.py
More file actions
221 lines (189 loc) · 8.41 KB
/
metadata.py
File metadata and controls
221 lines (189 loc) · 8.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import json
import base64
import hashlib
import os
import re
from typing import Optional, Dict, Tuple, Any
from datetime import datetime
from PIL import PngImagePlugin
def calculate_file_signature(filepath: str) -> str:
    """Return the SHA-256 hex digest of the entire file, or "" on any read error."""
    digest = hashlib.sha256()
    try:
        with open(filepath, 'rb') as handle:
            # Stream in fixed-size chunks so large files stay memory-bounded.
            while chunk := handle.read(8192):
                digest.update(chunk)
    except Exception:
        # Best-effort: missing/unreadable files yield an empty signature.
        return ""
    return digest.hexdigest()
def calculate_content_hash(data: Dict) -> str:
    """
    Calculates a semantic hash of the card content (core fields).
    Used to detect manual changes vs format conversions.
    """
    # Core fields that define the character's behavior.
    core_fields = ("name", "description", "first_mes",
                   "personality", "scenario", "mes_example")
    parts = []
    for key in core_fields:
        raw = data.get(key, "")
        # Normalize: None -> "", strip edges, unify Windows newlines.
        text = "" if raw is None else str(raw)
        parts.append(text.strip().replace("\r\n", "\n"))
    # "|||" is a separator unlikely to occur inside card text.
    return hashlib.sha256("|||".join(parts).encode("utf-8")).hexdigest()
def extract_chub_info(data: Dict) -> Dict[str, str]:
    """
    Extracts Chub-specific metadata from the card's extensions.

    Returns a dict with chub_id, chub_url and chub_updated_at keys
    (empty strings when the information is absent).
    """
    result = {
        "chub_id": "",
        "chub_url": "",
        "chub_updated_at": ""
    }
    ext = data.get("extensions", {})
    if not isinstance(ext, dict):
        return result
    chub = ext.get("chub", {})
    if isinstance(chub, dict):
        path = chub.get("full_path", "")
        if path:
            result["chub_id"] = str(path)
            result["chub_url"] = f"https://chub.ai/characters/{path}"
        elif chub.get("id"):
            # full_path is the preferred ID; the numeric id is a rare fallback.
            result["chub_id"] = str(chub.get("id"))
    return result
def extract_hub_links(extensions: Any) -> list[str]:
    """Collect hub URLs advertised in a card's extensions mapping."""
    if not isinstance(extensions, dict):
        return []
    found: list[str] = []
    for ext_name, payload in extensions.items():
        if ext_name == "chub" and isinstance(payload, dict):
            path = payload.get("full_path", "")
            if path:
                found.append(f"https://chub.ai/characters/{path}")
        elif isinstance(payload, dict):
            # Generic extensions: probe common URL-bearing keys, first hit wins.
            for candidate_key in ("full_path", "url", "link", "source_url"):
                value = payload.get(candidate_key)
                if isinstance(value, str) and value.strip():
                    value = value.strip()
                    if not value.startswith("http") and "/" in value:
                        # Known hubs store bare paths; expand them to full URLs.
                        hub = ext_name.lower()
                        if hub in ("janitor", "janitorai"):
                            value = f"https://janitorai.com/characters/{value}"
                        elif hub in ("character", "character_ai"):
                            value = f"https://character.ai/chat/{value}"
                    found.append(value)
                    break
    return found
def normalize_v2_data(data: Dict) -> Dict:
    """
    Flatten nested card data (CCv3 / wrapped V2) into a top-level V2 dict.

    Content under the 'data' key is hoisted to the top level; spec and
    spec_version are kept (with V2 defaults) and any remaining top-level
    keys are copied over as well, overriding same-named nested keys.
    Non-dict input yields an empty dict; already-flat input is returned
    unchanged.
    """
    if not isinstance(data, dict):
        return {}
    inner = data.get("data")
    if not isinstance(inner, dict):
        # Already flat (V1 or unwrapped V2).
        return data
    flat = {
        "spec": data.get("spec", "chara_card_v2"),
        "spec_version": data.get("spec_version", "2.0"),
        **inner,
    }
    for key, value in data.items():
        if key not in ("spec", "spec_version", "data"):
            flat[key] = value
    return flat
def read_png_metadata(filepath: str) -> Tuple[Optional[Dict], Optional[str]]:
    """
    Read an embedded character card from a PNG file's text chunks.

    Cards are stored as base64-encoded JSON in the 'ccv3' chunk (newer
    spec, checked first) or the 'chara' chunk. Returns (normalized_data,
    None) on success or (None, error_message) on failure.
    """
    try:
        with PngImagePlugin.PngImageFile(filepath) as im:
            text_chunks = im.text
            # ccv3 takes priority; the payload is often duplicated in chara.
            b64 = text_chunks.get('ccv3')
            if b64:
                try:
                    data = json.loads(base64.b64decode(b64).decode('utf-8'))
                    return normalize_v2_data(data), None
                except Exception:
                    # Corrupt ccv3 chunk: fall through to 'chara' below.
                    pass
            b64 = text_chunks.get('chara')
            if not b64:
                return None, "No character card metadata found"
            try:
                data = json.loads(base64.b64decode(b64).decode('utf-8'))
                return normalize_v2_data(data), None
            except Exception as e:
                return None, f"Decode error: {e}"
    except Exception as e:
        return None, str(e)
def read_json_metadata(filepath: str) -> Tuple[Optional[Dict], Optional[str]]:
    """
    Read a character card from a JSON file.

    The payload is flattened via normalize_v2_data and sanity-checked:
    a card must carry a name, or at least a description/personality
    (to also accept flat V1-style cards without a spec field). Returns
    (data, None) on success or (None, error_message) otherwise.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            raw = json.load(f)
    except json.JSONDecodeError:
        return None, "Invalid JSON"
    except Exception as e:
        return None, str(e)
    card = normalize_v2_data(raw)
    # Minimal validation: a missing name is tolerated only when other
    # core identity fields hint that this is still a character card.
    if not card.get("name") and not (card.get("description") or card.get("personality")):
        return None, "JSON does not look like a character card (missing name/description)"
    return card, None
def read_card_metadata(filepath: str) -> Tuple[Optional[Dict], Optional[str]]:
    """Dispatch card-metadata reading by file extension (.png or .json)."""
    suffix = os.path.splitext(filepath)[1].lower()
    if suffix == ".png":
        return read_png_metadata(filepath)
    if suffix == ".json":
        return read_json_metadata(filepath)
    return None, f"Unsupported extension: {suffix}"
def get_basic_index_info(meta: Dict) -> Tuple[str, list[str]]:
    """Pull the creator name and tag list out of normalized card metadata."""
    author = (meta.get("creator") or meta.get("author") or "").strip()
    if not author:
        author = "Unknown"
    raw_tags = meta.get("tags", [])
    if isinstance(raw_tags, list):
        cleaned = [str(item).strip() for item in raw_tags if item and str(item).strip()]
    elif isinstance(raw_tags, str):
        # Comma-separated string form; drop empty pieces.
        cleaned = [piece.strip() for piece in raw_tags.split(",") if piece.strip()]
    else:
        cleaned = []
    return author, cleaned
def uniform_parse_date(date_str: str) -> str:
    """
    Convert assorted date formats to ISO-8601 with millisecond precision.

    Handles custom stamps like '2025-6-19 @09h 11m 49s 677ms'. Anything
    unrecognized is passed through unchanged; empty or non-string input
    yields "".
    """
    if not isinstance(date_str, str) or not date_str:
        return ""
    custom = re.match(
        r"(\d{4})-(\d{1,2})-(\d{1,2})\s*@?(\d{1,2})h\s*(\d{1,2})m\s*(\d{1,2})s\s*(\d{1,3})ms",
        date_str,
    )
    if custom is None:
        return date_str
    year, month, day, hour, minute, second, millis = (int(g) for g in custom.groups())
    stamp = datetime(year, month, day, hour, minute, second, millis * 1000)
    return stamp.isoformat(timespec='milliseconds')