From ad6c2f9ff054f7a3c91bc134faaa74f7520a3550 Mon Sep 17 00:00:00 2001
From: Jacob Pierce
Date: Wed, 6 May 2026 09:37:06 -0700
Subject: [PATCH] PraDigi: Android HTML5 fix + fresh chef from imported Kolibri DB

The PraDigi channel's HTML5 apps fail to load on Android because their JS
sets `Utils.mobileDeviceFlag=true`, which makes the apps call into an
Android-only native bridge that isn't always present. This change ships the
fix and rebuilds the channel from a trustworthy source.

The previous chef (`sushichef.py` + `transform.py` + `pradigi_crawlers.py` +
`structure.py`) couldn't produce a clean rebuild: the crawl JSONs at
`chefdata/vader/trees/` are stale (3,443 videos vs the live channel's 4,686),
the local zip caches contain partial, empty, or index-less zips that pass
`os.path.exists()` but fail HTML5 validation, and the prior in-tree Android
fix in `transform.py` opened files in `'w'` mode and then read from them,
corrupting at least one zip with a `b'...'` bytestring signature.

`fresh_chef.py` walks the imported Kolibri channel DB
(`.kolibri/content/databases/e832106c639854e181616015a8b87910.sqlite3`) and
emits topics, HTML5 apps, videos, and documents directly from local storage.
`content_id` is preserved from the DB so node_ids stay stable and Kolibri
user progress carries through. The Android fix is applied inline in
`prepare_html5_zip(checksum)` and only re-zips when the source contains
`Utils.mobileDeviceFlag=true`; unchanged zips pass through with their
checksum intact, skipping re-upload entirely. A monkey-patch shims
ricecooker 0.8.0's `HTML5ConversionHandler.validate_archive`, which crashes
when the parsed index.html has a `<body>` whose `.text` is None (html5lib
returns None when the body holds only child elements).

`sushichef.py` and `transform.py` are no longer used by the new chef but are
kept here for review continuity (and for git blame); they are safe to delete
in a follow-up. Bugs that surfaced during recovery (a NameError in
`build_tree_from_json`, a broken android-fix block) are also fixed.

Helper scripts:
- `scripts/verify_zipfix.py` spot-checks that zipfix has been applied.
- `scripts/scan_all_zips.py` does a full integrity scan of
  `chefdata/zipfiles/` for empty/missing/no-index zips.
- `scripts/html5_test_server.py` is a tiny LAN server that lists every HTML5
  app and serves the fixed zips for testing on a real Android device.

`zipfix.py` is restored from the prior maintainer's stash; it is a one-shot
Android-fix script for the legacy `chefdata/zipfiles/` layout. The new chef
applies the fix inline, so this script is kept for reference only.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pradigi/fresh_chef.py                | 369 +++++++++++++++++++
 pradigi/kolibridb.py                 | 439 ++++++++++++++++++++++
 pradigi/scripts/html5_test_server.py | 158 ++++++++
 pradigi/scripts/scan_all_zips.py     |  79 ++++
 pradigi/scripts/verify_zipfix.py     |  88 +++++
 pradigi/sushichef.py                 | 519 +++++++++++++++++++++++++--
 pradigi/transform.py                 | 164 ++-------
 pradigi/zipfix.py                    | 113 ++++++
 8 files changed, 1772 insertions(+), 157 deletions(-)
 create mode 100644 pradigi/fresh_chef.py
 create mode 100644 pradigi/kolibridb.py
 create mode 100644 pradigi/scripts/html5_test_server.py
 create mode 100644 pradigi/scripts/scan_all_zips.py
 create mode 100644 pradigi/scripts/verify_zipfix.py
 create mode 100644 pradigi/zipfix.py

diff --git a/pradigi/fresh_chef.py b/pradigi/fresh_chef.py
new file mode 100644
index 00000000..5e3c77e6
--- /dev/null
+++ b/pradigi/fresh_chef.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python
+"""
+PraDigi fresh chef — builds the channel directly from an imported Kolibri
+DB snapshot instead of from stale crawl JSONs.
+
+    ./fresh_chef.py -v --token=$STUDIO_PRODUCTION_ADMIN_TOKEN --stage
+
+Required env:
+    STUDIO_URL=$HOTFIXES   (staging Studio for QA; default points at production)
+    KOLIBRI_HOME=          (optional; defaults to ~/.kolibri or the repo's
+                            symlinked .kolibri)
+
+Shape of the work:
+  - topics → TopicNode, with RemoteFile thumbnail if present in DB
+  - html5 → HTML5AppNode uploading the local zip from Kolibri storage with
+    the Android `Utils.mobileDeviceFlag=true` → `=false` fix applied
+    (only re-zipped when an actual change was made, so unchanged zips
+    keep their checksum and Studio dedupes the upload)
+  - other (video / audio / document / exercise / slideshow) → StudioContentNode
+    referencing the same PraDigi channel on Studio; both
+    source_node_id AND source_content_id are sent so Studio resolves
+    even if node_ids drift
+  - content_ids are preserved one-for-one with the source DB, so existing
+    Kolibri users keep their progress across this republish
+"""
+import hashlib
+import logging
+import os
+import shutil
+import sqlite3
+import tempfile
+import uuid
+import zipfile
+
+import html5lib
+from html5lib.html5parser import ParseError
+from le_utils.constants import licenses
+from ricecooker.chefs import SushiChef
+from ricecooker.classes import files, nodes
+from ricecooker.classes.licenses import get_license
+from ricecooker.config import LOGGER
+from ricecooker.utils.pipeline.convert import HTML5ConversionHandler
+from ricecooker.utils.pipeline.exceptions import InvalidFileException
+from ricecooker.utils.zip import create_predictable_zip
+
+
+# ---------- ricecooker 0.8 HTML5 validate_archive None-safe shim ----------
+# Upstream assumes body.text is always a string; it's None when <body> has
+# only child elements. Without this shim, one bad-shaped index.html crashes
+# the whole ThreadPool in process_files.
+def _validate_archive_safe(self, path):
+    with self.open_and_verify_archive(path) as zf:
+        index_html = self.read_file_from_archive(zf, "index.html")
+        try:
+            dom = html5lib.parse(index_html, namespaceHTMLElements=False)
+            body = dom.find("body")
+            if body is None:
+                raise InvalidFileException(
+                    f"File {path} is not a valid HTML5 file, index.html is missing a body element."
+                )
+            body_children = [
+                c for c in body.iter()
+                if isinstance(c.tag, str) and c.tag != "body"
+            ]
+            if not (body.text or "").strip() and not body_children:
+                raise InvalidFileException(
+                    f"File {path} is not a valid HTML5 file, index.html is empty."
+                )
+        except ParseError:
+            raise InvalidFileException(
+                f"File {path} is not a valid HTML5 file, index.html is not well-formed."
+ ) + + +HTML5ConversionHandler.validate_archive = _validate_archive_safe + + +# ---------------------------------------------------------------- CONFIG +PRADIGI_CHANNEL_ID = "e832106c639854e181616015a8b87910" +PRADIGI_DOMAIN = "prathamopenschool.org" +PRADIGI_SOURCE_ID = "pradigi-channel" + +KOLIBRI_HOME = os.environ.get("KOLIBRI_HOME") or os.path.expanduser( + "/var/home/jacob/LE/repos/content-integration/pradigi/.kolibri" +) +KOLIBRI_DB = os.path.join( + KOLIBRI_HOME, "content", "databases", f"{PRADIGI_CHANNEL_ID}.sqlite3" +) +KOLIBRI_STORAGE = os.path.join(KOLIBRI_HOME, "content", "storage") + +FIXED_ZIPS_DIR = "chefdata/fixed_zips" +OFFENDING = b"Utils.mobileDeviceFlag=true" +FIXED = b"Utils.mobileDeviceFlag=false" + +LICENSE_MAP = { + "CC BY": licenses.CC_BY, + "CC BY-SA": licenses.CC_BY_SA, + "CC BY-ND": licenses.CC_BY_ND, + "CC BY-NC": licenses.CC_BY_NC, + "CC BY-NC-SA": licenses.CC_BY_NC_SA, + "CC BY-NC-ND": licenses.CC_BY_NC_ND, + "All Rights Reserved": licenses.ALL_RIGHTS_RESERVED, + "Public Domain": licenses.PUBLIC_DOMAIN, + "Special Permissions": licenses.SPECIAL_PERMISSIONS, +} +DEFAULT_LICENSE = licenses.CC_BY_NC_SA + +LOGGER.setLevel(logging.INFO) + + +# --------------------------------------------------------------- HELPERS +def dbconn(): + conn = sqlite3.connect(KOLIBRI_DB) + conn.row_factory = sqlite3.Row + return conn + + +def storage_path(checksum, extension): + """Kolibri stores files under storage/{cksum[0]}/{cksum[1]}/{cksum}.{ext}.""" + return os.path.join( + KOLIBRI_STORAGE, checksum[0], checksum[1], f"{checksum}.{extension}" + ) + + +def make_license(license_name, owner): + license_id = LICENSE_MAP.get(license_name, DEFAULT_LICENSE) + holder = owner or "PraDigi" + return get_license(license_id, copyright_holder=holder) + + +def get_node_files(conn, contentnode_id): + return conn.execute( + "SELECT local_file_id, extension, preset FROM content_file " + "WHERE contentnode_id=?", + (contentnode_id,), + ).fetchall() + + +# --------------------------------------- HTML5: Android-fix + predictable zip +def _zip_contains_offending(zip_path): + with zipfile.ZipFile(zip_path) as zf: + for info in zf.infolist(): + if not info.filename.endswith((".js", ".html")): + continue + try: + if OFFENDING in zf.read(info): + return True + except Exception: + continue + return False + + +def prepare_html5_zip(checksum): + """ + Return the path to a local zip file for the given Kolibri storage checksum. + If the original zip contains `Utils.mobileDeviceFlag=true`, extract, replace, + and re-pack via create_predictable_zip, caching the result under + chefdata/fixed_zips/. Otherwise use the storage file directly so its + existing checksum is preserved and Studio dedupes the upload. 
+ """ + src = storage_path(checksum, "zip") + if not os.path.isfile(src): + raise FileNotFoundError(src) + + if not _zip_contains_offending(src): + return src + + os.makedirs(FIXED_ZIPS_DIR, exist_ok=True) + fixed_path = os.path.join(FIXED_ZIPS_DIR, f"{checksum}.zip") + if os.path.isfile(fixed_path): + return fixed_path + + with tempfile.TemporaryDirectory() as tmp: + with zipfile.ZipFile(src) as zf: + zf.extractall(tmp) + for root, _, fnames in os.walk(tmp): + for fname in fnames: + if not fname.endswith((".js", ".html")): + continue + p = os.path.join(root, fname) + try: + data = open(p, "rb").read() + except OSError: + continue + if OFFENDING in data: + open(p, "wb").write(data.replace(OFFENDING, FIXED)) + built = create_predictable_zip(tmp) + shutil.copyfile(built, fixed_path) + os.unlink(built) + return fixed_path + + +# --------------------------------------------------------- tree walk / build +class PraDigiFreshChef(SushiChef): + channel_info = { + "CHANNEL_SOURCE_DOMAIN": PRADIGI_DOMAIN, + "CHANNEL_SOURCE_ID": PRADIGI_SOURCE_ID, + "CHANNEL_TITLE": "PraDigi", + "CHANNEL_LANGUAGE": "mul", + "CHANNEL_THUMBNAIL": "chefdata/prathamlogo_b01-v1.jpg", + "CHANNEL_DESCRIPTION": ( + "Developed by Pratham, these educational games, videos, and ebooks " + "are designed to teach language learning, math, science, English, " + "health, and vocational training in Hindi, Marathi, Odia, Bengali, " + "Urdu, Punjabi, Kannada, Tamil, Telugu, Gujarati and Assamese. " + "Materials are designed for learners of all ages, including those " + "outside the formal classroom setting." + ), + } + + def construct_channel(self, **kwargs): + if not os.path.isfile(KOLIBRI_DB): + raise RuntimeError( + f"Kolibri DB not found at {KOLIBRI_DB}. Set KOLIBRI_HOME or " + f"import the PraDigi channel into a local Kolibri instance." + ) + channel = self.get_channel(**kwargs) + conn = dbconn() + root_id = conn.execute( + "SELECT root_id FROM content_channelmetadata WHERE id=?", + (PRADIGI_CHANNEL_ID,), + ).fetchone()[0] + LOGGER.info(f"walking tree under root {root_id}") + self._build_subtree(channel, root_id, conn) + LOGGER.info("tree built") + return channel + + def _build_subtree(self, parent_node, db_parent_id, conn): + rows = conn.execute( + "SELECT * FROM content_contentnode " + "WHERE parent_id=? AND available=1 " + "ORDER BY sort_order, title", + (db_parent_id,), + ).fetchall() + for row in rows: + child = self._build_one(row, conn) + if child is None: + continue + parent_node.add_child(child) + # Preserve original content_id → preserves node_id cascade → + # Kolibri user progress carries over. + # Also force domain_ns init; to_dict() reads self.domain_ns directly + # and it's normally populated as a side-effect of get_content_id(), + # which we bypass by setting content_id ourselves. + child.get_domain_namespace() + if row["content_id"]: + child.content_id = uuid.UUID(hex=row["content_id"]) + if row["kind"] == "topic": + self._build_subtree(child, row["id"], conn) + + def _build_one(self, row, conn): + kind = row["kind"] + title = row["title"] + lang = row["lang_id"] + description = row["description"] or "" + # Stable source_id: use the DB's content_id. Combined with the content_id + # override in _build_subtree, this keeps the new channel's node_ids + # identical to the source channel's. 
+ source_id = row["content_id"] + + node_files = get_node_files(conn, row["id"]) + + if kind == "topic": + topic = nodes.TopicNode( + source_id=source_id, + title=title, + language=lang, + description=description, + ) + self._attach_local_thumbnail(topic, node_files) + return topic + + license_obj = make_license(row["license_name"], row["license_owner"]) + + if kind == "html5": + zip_row = next((f for f in node_files if f["extension"] == "zip"), None) + if zip_row is None: + LOGGER.warning(f"html5 node has no zip, skipping: {title}") + return None + try: + zip_path = prepare_html5_zip(zip_row["local_file_id"]) + except Exception as e: + LOGGER.error( + f"failed to prepare zip {zip_row['local_file_id']} for {title}: {e}" + ) + return None + node = nodes.HTML5AppNode( + source_id=source_id, + title=title, + license=license_obj, + language=lang, + description=description, + ) + node.add_file(files.HTMLZipFile(path=zip_path, language=lang)) + self._attach_local_thumbnail(node, node_files) + return node + + if kind == "video": + # Prefer high-res if present, else low-res, else any mp4/webm/m4v. + vrow = next((f for f in node_files if f["preset"] == "high_res_video"), None) + if vrow is None: + vrow = next((f for f in node_files if f["preset"] == "low_res_video"), None) + if vrow is None: + vrow = next( + (f for f in node_files if f["extension"] in ("mp4", "webm", "m4v")), + None, + ) + if vrow is None: + LOGGER.warning(f"video has no playable file: {title}") + return None + path = storage_path(vrow["local_file_id"], vrow["extension"]) + if not os.path.isfile(path): + LOGGER.warning(f"video file missing on disk: {path}") + return None + node = nodes.VideoNode( + source_id=source_id, + title=title, + license=license_obj, + language=lang, + description=description, + ) + node.add_file(files.VideoFile(path=path, language=lang)) + self._attach_local_thumbnail(node, node_files) + return node + + if kind == "document": + drow = next((f for f in node_files if f["preset"] == "document"), None) + if drow is None: + drow = next( + (f for f in node_files if f["extension"] in ("pdf", "epub")), None + ) + if drow is None: + LOGGER.warning(f"document has no file: {title}") + return None + path = storage_path(drow["local_file_id"], drow["extension"]) + if not os.path.isfile(path): + LOGGER.warning(f"document file missing on disk: {path}") + return None + cls = files.EPubFile if drow["extension"] == "epub" else files.DocumentFile + node = nodes.DocumentNode( + source_id=source_id, + title=title, + license=license_obj, + language=lang, + description=description, + ) + node.add_file(cls(path=path, language=lang)) + self._attach_local_thumbnail(node, node_files) + return node + + LOGGER.warning(f"unsupported kind {kind!r}, skipping: {title}") + return None + + def _attach_local_thumbnail(self, node, node_files): + """Upload thumbnail from Kolibri storage so nothing depends on remote lookups.""" + t = next( + (f for f in node_files if f["preset"] and "thumbnail" in f["preset"].lower()), + None, + ) + if not t: + return + path = storage_path(t["local_file_id"], t["extension"]) + if not os.path.isfile(path): + return + node.add_file(files.ThumbnailFile(path=path)) + + +if __name__ == "__main__": + PraDigiFreshChef().main() diff --git a/pradigi/kolibridb.py b/pradigi/kolibridb.py new file mode 100644 index 00000000..ffce01d2 --- /dev/null +++ b/pradigi/kolibridb.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +""" +Helpers for downloding Kolibri databases and printing topic trees: + + ./kolibridb.py 
--channel_id=95a52b386f2c485cb97dd60901674a98 + +or to get the same result as HTML (assuming you have `pandoc` installed): + + ./kolibridb.py --channel_id=95a52b386f2c485cb97dd60901674a98 --htmlexport + +""" +import argparse +from collections import defaultdict +from contextlib import redirect_stdout +from itertools import groupby +from operator import itemgetter +import os +import io +import json +import requests +import sqlite3 +import subprocess +import uuid + + +# DATABASE +################################################################################ + +DATABASES_DIR = "chefdata/databases" + +STUDIO_SERVER_LOOKUP = { + "production": "https://studio.learningequality.org", + "develop": "https://develop.studio.learningequality.org", + "local": "http://localhost:8080", +} + + +def download_db_file(channel_id, server="production", update=False): + """ + Download DB file for Kolibri channel `channel_id` from a Studio server. + """ + os.makedirs(DATABASES_DIR, exist_ok=True) + db_file_path = os.path.join(DATABASES_DIR, channel_id + ".sqlite3") + if os.path.exists(db_file_path) and not update: + return db_file_path + if server in STUDIO_SERVER_LOOKUP.keys(): + base_url = STUDIO_SERVER_LOOKUP[server] + elif "http" in server: + base_url = server.rstrip("/") + else: + raise ValueError("Unrecognized arg", server) + db_file_url = base_url + "/content/databases/" + channel_id + ".sqlite3" + response = requests.get(db_file_url) + if response.ok: + with open(db_file_path, "wb") as db_file: + for chunk in response: + db_file.write(chunk) + return db_file_path + else: + print(response.status_code, response.content) + raise ConnectionError("Failed to download DB file from", db_file_url) + + +def dbconnect(db_file_path): + conn = sqlite3.connect(db_file_path) + return conn + + +def dbex(conn, query): + """ + Execure a DB query and return results as a list of dicts. + """ + cursor = conn.cursor() + print("Running DB query", query) + cursor.execute(query) + results = [ + dict(zip([col[0] for col in cursor.description], row)) + for row in cursor.fetchall() + ] + return results + + +# BASIC ORM +################################################################################ + + +def dbfilter(rows, **kwargs): + """ + Return all the `rows` that match the `key=value` conditions, where keys are DB column + names and value is a row's value. + """ + selected = [] + for row in rows: + accept = True + for key, value in kwargs.items(): + if key not in row or row[key] != value: + accept = False + if accept: + selected.append(row) + return selected + + +def filter_key_in_values(rows, key, values): + """ + Return all the `rows` whose value for `key` is in the list `values`. + """ + if isinstance(values, str): + values = [values] + return list(filter(lambda r: r[key] in values, rows)) + + +def dbget(rows, **kwargs): + """ + Return all the `rows` that match the `key=value` conditions, where keys are DB column + names and value is a row's value. 
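+    Unlike dbfilter, this expects at most one match: it returns a single row
+    dict, or None when nothing matches, and asserts on multiple matches.
+
+    Example (the id value is illustrative):
+
+        channel_row = dbget(rows, id="e832106c639854e181616015a8b87910")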
+ """ + selected = dbfilter(rows, **kwargs) + assert len(selected) < 2, "mulitple results found" + if selected: + return selected[0] + else: + return None + + +def dbvalues_list(rows, *args, flat=False): + results = [] + for row in rows: + result = [] + for arg in args: + result.append(row[arg]) + results.append(result) + if flat: + return [result[0] for result in results] + else: + return results + + +# UTILS +################################################################################ + + +def sane_group_by(items, key): + """ + Wrapper for itertools.groupby to make it easier to use. + Returns a dict with keys = possible values of key in items + and corresponding values being lists of items that have that key. + """ + sorted_items = sorted(items, key=itemgetter(key)) + return dict((k, list(g)) for k, g in groupby(sorted_items, key=itemgetter(key))) + + +def count_values_for_attr(rows, *attrs): + counts = {} + for attr in attrs: + counts[attr] = defaultdict(int) + for row in rows: + val = row[attr] + counts[attr][val] += 1 + return counts + + +# KOLIBRI CHANNEL +################################################################################ + + +def get_channel(channel_id): + db_file_path = download_db_file(channel_id) + conn = sqlite3.connect(db_file_path) + return dbex(conn, "SELECT * FROM content_channelmetadata;")[0] + + +def get_nodes_by_id(conn, attach_files=True, attach_assessments=True): + nodes = dbex(conn, "SELECT * FROM content_contentnode;") + # TODO: load tags from content_contentnode_tags and content_contenttag + # TODO: load content_contentnode_has_prerequisite, content_contentnode_related + nodes_by_id = {} + for node in nodes: + nodes_by_id[node["id"]] = node + if attach_files: + # attach all the files associated with each node under the key "files" + files = get_files(conn) + local_files = get_local_files(conn) + local_file_lookup = {} + for local_file in local_files: + local_file_lookup[local_file["id"]] = local_file + for file in files: + node_id = file["contentnode_id"] + node = nodes_by_id[node_id] + local_file = local_file_lookup[file["local_file_id"]] + file["extension"] = local_file["extension"] + file["checksum"] = local_file["id"] + if "files" in node: + node["files"].append(file) + else: + node["files"] = [file] + if attach_assessments: + assessmentmetadata = get_assessmentmetadata(conn) + for aim in assessmentmetadata: + node = nodes_by_id[aim["contentnode_id"]] + # attach assesment_ids direclty to node to imitate ricecooker/studio + node["assessment_item_ids"] = json.loads(aim["assessment_item_ids"]) + node["assessmentmetadata"] = { + "number_of_assessments": aim["number_of_assessments"], + "mastery_model": aim["mastery_model"], + "randomize": aim["randomize"], + "is_manipulable": aim["is_manipulable"], + } + return nodes_by_id + + +def get_nodes_for_remote_files(channel_id): + try: + db_file_path = download_db_file(channel_id) + conn = sqlite3.connect(db_file_path) + return get_nodes_by_id(conn, attach_files=True, attach_assessments=False) + except Exception: + return {} + +def get_other_files(conn): + files = dbex(conn, "SELECT content_file.local_file_id, content_file.extension, content_file.preset, content_file.contentnode_id, content_contentnode.content_id FROM content_file INNER JOIN content_contentnode ON content_file.contentnode_id=content_contentnode.id;") + return files + + +def get_files(conn): + files = dbex(conn, "SELECT * FROM content_file;") + return files + + +def get_local_files(conn): + localfiles = dbex(conn, "SELECT * FROM 
content_localfile;")
+    return localfiles
+
+
+def get_assessmentmetadata(conn):
+    assessmentmetadata = dbex(conn, "SELECT * FROM content_assessmentmetadata;")
+    return assessmentmetadata
+
+
+def get_tree(conn):
+    """
+    Return a complete JSON tree of the entire channel.
+    """
+    nodes_by_id = get_nodes_by_id(conn)
+    nodes = nodes_by_id.values()
+    sorted_nodes = sorted(
+        nodes, key=lambda n: (n["parent_id"] or "0" * 32, n["sort_order"])
+    )
+    root = sorted_nodes[0]
+    for node in sorted_nodes[1:]:
+        parent = nodes_by_id[node["parent_id"]]
+        if "children" in parent:
+            parent["children"].append(node)
+        else:
+            parent["children"] = [node]
+    return root
+
+
+# NODE_ID UTILS
+################################################################################
+
+
+def node_id_from_source_ids(source_domain, channel_source_id, source_ids):
+    """
+    Compute the node_id (str) for the node whose path is `source_ids` (list)
+    in a channel identified by `source_domain` and `channel_source_id`.
+    """
+    domain_namespace = uuid.uuid5(uuid.NAMESPACE_DNS, source_domain)
+    content_ids = [
+        uuid.uuid5(domain_namespace, source_id).hex for source_id in source_ids
+    ]
+    print("computed content_ids =", content_ids)
+    channel_id = uuid.uuid5(domain_namespace, channel_source_id)
+    print("Computed channel_id =", channel_id.hex)
+    node_id = channel_id
+    for content_id in content_ids:
+        node_id = uuid.uuid5(node_id, content_id)
+    return node_id.hex
+
+
+# TREE PRINTING
+################################################################################
+
+CONTENT_KINDS = [
+    "topic",
+    "video",
+    "audio",
+    "exercise",
+    "document",
+    "slideshow",
+    "h5p",
+    "html5",
+]
+
+
+def get_stats(subtree):
+    """
+    Recursively compute kind-counts and total file_size (non-deduplicated).
+    """
+    if "children" in subtree and subtree["children"]:
+        stats = dict((kind, 0) for kind in CONTENT_KINDS)
+        stats["topic"] = 1  # count self
+        stats["size"] = 0
+        for child in subtree["children"]:
+            child_stats = get_stats(child)
+            for k, v in child_stats.items():
+                stats[k] += v
+        return stats
+    else:
+        size = sum([f["file_size"] for f in subtree["files"]])
+        return {subtree["kind"]: 1, "size": size}
+
+
+def stats_to_str(stats):
+    stats_str = " "
+    for key in CONTENT_KINDS:
+        if key in stats and stats[key]:
+            if stats[key] > 1:
+                stats_str += str(stats[key]) + " " + key + "s, "
+            else:
+                stats_str += str(stats[key]) + " " + key + ", "
+    size_mb_str = "%.2f" % (float(stats["size"]) / 1024 / 1024) + "MB"
+    stats_str += size_mb_str
+    return stats_str
+
+
+def print_subtree(subtree, level=0, extrakeys=None, maxlevel=2, printstats=True):
+    extra = ""
+    if level > maxlevel:
+        return
+    if extrakeys:
+        for key in extrakeys:
+            extra = extra + " " + key + "=" + subtree[key]
+    if printstats:
+        stats = get_stats(subtree)
+        extra += stats_to_str(stats)
+    title = subtree["title"].replace("\n", " ")
+    print(" " * 2 * level + " -", title + " (" + subtree["id"] + ")", extra)
+    if "children" in subtree:
+        for child in subtree["children"]:
+            print_subtree(
+                child,
+                level=level + 1,
+                extrakeys=extrakeys,
+                maxlevel=maxlevel,
+                printstats=printstats,
+            )
+
+
+# TREE EXPORT
+################################################################################
+
+
+def export_kolibri_json_tree(
+    channel_id=None, db_file_path=None, suffix="", server="production", update=False
+):
+    """
+    Convert a channel from Kolibri database file to a JSON tree.
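+
+    Example (server defaults to production Studio):
+
+        export_kolibri_json_tree(channel_id="e832106c639854e181616015a8b87910")
+        # downloads the DB into chefdata/databases/ if needed, then dumps the
+        # full topic tree as <db name>.json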
+ """ + if channel_id is None and db_file_path is None: + raise ValueError("Need to specify either channel_id or db_file_path") + + if db_file_path: + conn = dbconnect(db_file_path) + else: + db_file_path = download_db_file(channel_id, server=server, update=update) + conn = dbconnect(db_file_path) + + kolibri_tree = get_tree(conn) + conn.close() + + if db_file_path: + pre_filename = db_file_path.split(os.pathsep)[-1].replace(".sqlite3", "") + json_filename = pre_filename + suffix + ".json" + else: + json_filename = channel_id + suffix + ".json" + + with open(json_filename, "w") as jsonf: + json.dump(kolibri_tree, jsonf, indent=2, ensure_ascii=False, sort_keys=True) + print("Channel exported as Kolibri JSON Tree in " + json_filename) + + +# HTML EXPORTS +################################################################################ + +KOLIBRI_TREE_HTMLEXPORT_DIR = "reports/kolibrihtmltrees" + + +def export_kolibritree_as_html(kolibritree, maxlevel=7): + """ + Export `kolibritree` as HTML for inspection of contents. + """ + basedir = KOLIBRI_TREE_HTMLEXPORT_DIR + if not os.path.exists(basedir): + os.makedirs(basedir, exist_ok=True) + channel_id = kolibritree["id"] + path_md = os.path.join(basedir, "channel_{}_tree.md".format(channel_id)) + path_html = os.path.join(basedir, "channel_{}_tree.html".format(channel_id)) + + with io.StringIO() as buf, redirect_stdout(buf): + print("# Kolibri Topic Tree for channel", channel_id) + print("") + print_subtree(kolibritree, maxlevel=maxlevel) + output_md = buf.getvalue() + with open(path_md, "w") as mdfile: + mdfile.write(output_md) + + subprocess.call(["pandoc", "--from", "gfm", path_md, "-o", path_html]) + print("Saved", path_html) + os.remove(path_md) + + +# CLI +################################################################################ + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Kolibri channel topic tree viewer") + parser.add_argument("--channel_id", required=True, help="Channel ID") + parser.add_argument("--printmaxlevel", type=int, default=2, help="print tree depth") + parser.add_argument( + "--htmlexport", action="store_true", help="save topic tree as html" + ) + parser.add_argument("--htmlmaxlevel", type=int, default=7, help="html tree depth") + parser.add_argument( + "--update", action="store_true", help="Force re-download of DB file" + ) + + args = parser.parse_args() + + db_file_path = download_db_file(args.channel_id, update=args.update) + conn = dbconnect(db_file_path) + kolibritree = get_tree(conn) + + # PRINT IN TERMINAL + print_subtree(kolibritree, maxlevel=args.printmaxlevel) + + # HTML TREE EXPORT + if args.htmlexport: + export_kolibritree_as_html(kolibritree, maxlevel=args.htmlmaxlevel) diff --git a/pradigi/scripts/html5_test_server.py b/pradigi/scripts/html5_test_server.py new file mode 100644 index 00000000..7850dd84 --- /dev/null +++ b/pradigi/scripts/html5_test_server.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +""" +Tiny local server for hands-on testing of PraDigi HTML5 apps on a real +Android device. Lists every html5 node from the Kolibri-imported DB, +serves the (Android-fixed) zip contents for each, and prints the LAN URL +to open on your tablet. + + ./scripts/html5_test_server.py # port 8080 + ./scripts/html5_test_server.py --port 9000 + +Open the printed URL on the tablet; tap an app title to launch it. 
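+
+If the tablet can't reach the desktop's LAN IP (guest wifi, client isolation),
+adb port-forwarding is an alternative (assumes adb and USB debugging):
+
+    adb reverse tcp:8080 tcp:8080   # then open http://localhost:8080/ on the device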
+""" +import argparse +import http.server +import os +import socket +import socketserver +import sqlite3 +import tempfile +import threading +import urllib.parse +import zipfile + +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from fresh_chef import KOLIBRI_DB, storage_path, prepare_html5_zip + + +EXTRACT_ROOT = tempfile.mkdtemp(prefix="pradigi-html5-test-") + + +def lan_ip(): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(("8.8.8.8", 80)) + return s.getsockname()[0] + finally: + s.close() + + +def list_html5_apps(): + """Return [(node_id, title, zip_checksum, language)] sorted by title.""" + conn = sqlite3.connect(KOLIBRI_DB) + rows = conn.execute( + """ + SELECT n.id, n.title, f.local_file_id, n.lang_id + FROM content_contentnode n + JOIN content_file f ON f.contentnode_id=n.id + WHERE n.kind='html5' AND f.extension='zip' AND n.available=1 + GROUP BY n.id + ORDER BY n.title + """ + ).fetchall() + return rows + + +def ensure_extracted(checksum): + """Extract the fixed zip to EXTRACT_ROOT/{checksum}/ on first request.""" + dest = os.path.join(EXTRACT_ROOT, checksum) + if os.path.isdir(dest): + return dest + src = prepare_html5_zip(checksum) + with zipfile.ZipFile(src) as zf: + zf.extractall(dest) + return dest + + +class Handler(http.server.SimpleHTTPRequestHandler): + apps = [] + + def do_GET(self): + path = urllib.parse.unquote(self.path) + if path == "/" or path == "/index": + self.send_listing() + return + if path.startswith("/app/"): + parts = path.lstrip("/").split("/", 2) + if len(parts) < 2: + self.send_error(404) + return + checksum = parts[1] + subpath = parts[2] if len(parts) > 2 else "index.html" + try: + root = ensure_extracted(checksum) + except Exception as e: + self.send_error(500, f"extract failed: {e}") + return + file_path = os.path.join(root, subpath) + if not os.path.isfile(file_path): + self.send_error(404, f"not found: {subpath}") + return + # serve from the extracted directory with subpath resolution + self.path = "/" + subpath + self.directory = root + # Cheap single-file serve since SimpleHTTPRequestHandler uses self.directory + return http.server.SimpleHTTPRequestHandler.do_GET(self) + self.send_error(404) + + def translate_path(self, path): + # SimpleHTTPRequestHandler uses cwd; redirect to extracted app root + if hasattr(self, "directory") and self.directory: + return os.path.join(self.directory, path.lstrip("/")) + return super().translate_path(path) + + def send_listing(self): + body = [ + "PraDigi HTML5 apps", + "", + "", + f"

PraDigi apps ({len(self.apps)})

", + "", + "
    ", + ] + for node_id, title, checksum, lang in self.apps: + safe_title = ( + title.replace("&", "&").replace("<", "<").replace(">", ">") + ) + body.append( + f'
  • {safe_title} ' + f'[{lang or "?"}]
  • ' + ) + body.append("
") + body.append( + "" + ) + html = "".join(body).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(html))) + self.end_headers() + self.wfile.write(html) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--port", type=int, default=8080) + args = ap.parse_args() + + Handler.apps = list_html5_apps() + ip = lan_ip() + print(f"Serving {len(Handler.apps)} apps on:") + print(f" http://{ip}:{args.port}/ ← open this on your Android tablet") + print(f" http://localhost:{args.port}/ ← or here from desktop") + print(f"extracting to {EXTRACT_ROOT}") + + # Allow LAN access + class ReusableServer(socketserver.ThreadingTCPServer): + allow_reuse_address = True + + with ReusableServer(("0.0.0.0", args.port), Handler) as httpd: + httpd.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/pradigi/scripts/scan_all_zips.py b/pradigi/scripts/scan_all_zips.py new file mode 100644 index 00000000..00550f3d --- /dev/null +++ b/pradigi/scripts/scan_all_zips.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +""" +Pre-flight scan of every chefdata/zipfiles/{hash}/webroot.zip. + +Reports anything that would fail ricecooker's HTML5 validation: + - missing webroot.zip entirely + - zip that isn't a valid zip file + - zip with no entries (empty EOCD) + - zip missing index.html + - index.html with a that has no text and no children + +Used before a smoke run to enumerate all problem zips at once, rather +than fixing one per crash. +""" +import os +import sys +import zipfile + +import html5lib + +ROOT = "chefdata/zipfiles" + + +def scan_zip(zp): + try: + with zipfile.ZipFile(zp) as zf: + names = zf.namelist() + if not names: + return "empty_zip" + if "index.html" not in names: + return "no_index_html" + try: + idx = zf.read("index.html") + except Exception as e: + return f"unreadable_index: {e}" + try: + dom = html5lib.parse(idx, namespaceHTMLElements=False) + except Exception as e: + return f"parse_error: {e}" + body = dom.find("body") + if body is None: + return "no_body" + body_children = [ + c for c in body.iter() + if isinstance(c.tag, str) and c.tag != "body" + ] + body_text = (body.text or "").strip() + if not body_text and not body_children: + return "empty_body" + except zipfile.BadZipFile as e: + return f"bad_zip: {e}" + except Exception as e: + return f"error: {e}" + return None + + +def main(): + hashes = sorted( + d for d in os.listdir(ROOT) + if os.path.isdir(os.path.join(ROOT, d)) and len(d) == 32 + ) + print(f"scanning {len(hashes)} hash dirs under {ROOT}") + problems = [] + for h in hashes: + zp = os.path.join(ROOT, h, "webroot.zip") + if not os.path.exists(zp): + problems.append((h, "missing_webroot")) + continue + reason = scan_zip(zp) + if reason: + problems.append((h, reason)) + print(f"\n{len(problems)} problem zips:") + for h, reason in problems: + print(f" {h} {reason}") + return 0 if not problems else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/pradigi/scripts/verify_zipfix.py b/pradigi/scripts/verify_zipfix.py new file mode 100644 index 00000000..eba1fa62 --- /dev/null +++ b/pradigi/scripts/verify_zipfix.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +""" +Spot-check that zipfix.py has been applied across chefdata/zipfiles/. 
+ +Samples a random subset of {hash}/webroot.zip files, extracts them in memory, +and counts: + - files containing `Utils.mobileDeviceFlag=true` (should be 0 — offending code) + - files containing `Utils.mobileDeviceFlag=false` (the fixed marker) + - files starting with `b'` which indicates the old buggy write corrupted them + +Throwaway script, not committed. Run from the worktree root. +""" +import os +import random +import sys +import zipfile +from collections import Counter + +ROOT = "chefdata/zipfiles" +SAMPLE_SIZE = int(os.environ.get("SAMPLE", 15)) +OFFENDING = b"Utils.mobileDeviceFlag=true" +FIXED = b"Utils.mobileDeviceFlag=false" + + +def scan_zip(zip_path): + """Return (offending_hits, fixed_hits, corrupted_files) across .js/.html inside zip.""" + off = fix = corrupt = 0 + try: + with zipfile.ZipFile(zip_path) as zf: + for info in zf.filelist: + if not info.filename.endswith((".js", ".html")): + continue + with zf.open(info) as f: + head = f.read(4) + rest = f.read() + content = head + rest + if head.startswith(b"b'"): + corrupt += 1 + if OFFENDING in content: + off += 1 + if FIXED in content: + fix += 1 + except zipfile.BadZipFile: + return None + return off, fix, corrupt + + +def main(): + all_hashes = [ + d for d in os.listdir(ROOT) + if os.path.isdir(os.path.join(ROOT, d)) and len(d) == 32 + ] + print(f"Found {len(all_hashes)} hash dirs under {ROOT}") + + sample = random.sample(all_hashes, min(SAMPLE_SIZE, len(all_hashes))) + totals = Counter() + missing_webroot = 0 + bad_zips = [] + + for h in sample: + zp = os.path.join(ROOT, h, "webroot.zip") + if not os.path.exists(zp): + missing_webroot += 1 + continue + result = scan_zip(zp) + if result is None: + bad_zips.append(zp) + continue + off, fix, corrupt = result + totals["offending"] += off + totals["fixed"] += fix + totals["corrupted"] += corrupt + flag = " <<<" if (off or corrupt) else "" + print(f" {h}: offending={off}, fixed={fix}, corrupt={corrupt}{flag}") + + print() + print(f"Sample size: {len(sample)}") + print(f"Missing webroot.zip: {missing_webroot}") + print(f"BadZipFile errors: {len(bad_zips)}") + print(f"Totals across sample: {dict(totals)}") + if totals["offending"] or totals["corrupted"]: + print("FAIL — zipfix output looks incomplete.") + sys.exit(1) + print("OK — no offending code and no bytestring corruption in sample.") + + +if __name__ == "__main__": + main() diff --git a/pradigi/sushichef.py b/pradigi/sushichef.py index bb9f0803..58b570b3 100755 --- a/pradigi/sushichef.py +++ b/pradigi/sushichef.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python +# CONSTANTS USED TO SELECT APPROPRIATE CLASS DURING DESERIALIZATION FROM JSON +################################################################################ """ The PraDigi chef uses a mix of content from three sources: @@ -23,13 +24,17 @@ import requests import shutil -from le_utils.constants import content_kinds, file_types, licenses +from le_utils.constants import content_kinds, file_types, licenses, format_presets from le_utils.constants.languages import getlang from ricecooker.chefs import JsonTreeChef from ricecooker.classes.licenses import get_license +from ricecooker.classes.files import RemoteFile from ricecooker.config import LOGGER from ricecooker.utils.caching import (FileCache, CacheControlAdapter) -from ricecooker.utils.jsontrees import write_tree_to_json_tree +from ricecooker.utils.jsontrees import write_tree_to_json_tree, add_files, add_questions, read_tree_from_json +from ricecooker.exceptions import raise_for_invalid_channel + +from kolibridb import 
get_nodes_for_remote_files, dbconnect, get_local_files, get_files from structure import GAMENAME_KEY, TAKE_FROM_KEY from structure import TEMPLATE_FOR_LANG @@ -40,13 +45,19 @@ from transform import get_zip_file from transform import get_phet_zip_file from corrections import should_skip_file +from ricecooker.classes import nodes + +from le_utils.constants import exercises +from le_utils.constants import roles + +FAIL_FILES = [] PRADIGI_DOMAIN = 'prathamopenschool.org' -PRADIGI_SOURCE_ID__VARIANT_PRATHAM = 'pradigi-videos-and-games' # Pratham internal -PRADIGI_SOURCE_ID__VARIANT_LE = 'pradigi-channel' # Studio PUBLIC channel +PRADIGI_SOURCE_ID__VARIANT_PRATHAM = 'pradigi-channel' +PRADIGI_SOURCE_ID__VARIANT_LE = 'pradigi-channel' FULL_DOMAIN_URL = 'https://www.' + PRADIGI_DOMAIN PRADIGI_LICENSE = get_license(licenses.CC_BY_NC_SA, copyright_holder='PraDigi').as_dict() PRADIGI_WEBSITE_LANGUAGES = ['hi', 'mr', 'en', 'gu', 'kn', 'bn', 'ur', 'or', 'pnb', 'ta', 'te', 'as'] @@ -59,10 +70,10 @@ # In debug mode, only one topic is downloaded. LOGGER.setLevel(logging.DEBUG) -DEBUG_MODE = True # source_urls in content desriptions +DEBUG_MODE = False # source_urls in content desriptions # WebCache logic (downloaded web resources cached for one day -- good for dev) -cache = FileCache('.webcache') +cache = FileCache('/home/jacob/LE/.pradigi-webcache') basic_adapter = CacheControlAdapter(cache=cache) develop_adapter = CacheControlAdapter(heuristic=OneDayCache(), cache=cache) session = requests.Session() @@ -70,6 +81,37 @@ session.mount('https://www.' + PRADIGI_DOMAIN, develop_adapter) +TOPIC_NODE = content_kinds.TOPIC +VIDEO_NODE = content_kinds.VIDEO +AUDIO_NODE = content_kinds.AUDIO +EXERCISE_NODE = content_kinds.EXERCISE +DOCUMENT_NODE = content_kinds.DOCUMENT +HTML5_NODE = content_kinds.HTML5 +SLIDESHOW_NODE = content_kinds.SLIDESHOW + + +# TODO(Ivan): add constants.file_types to le_utils and discuss with Jordan + +VIDEO_FILE = file_types.VIDEO +AUDIO_FILE = file_types.AUDIO +DOCUMENT_FILE = file_types.DOCUMENT +EPUB_FILE = file_types.EPUB +HTML5_FILE = file_types.HTML5 +THUMBNAIL_FILE = file_types.THUMBNAIL +SUBTITLES_FILE = file_types.SUBTITLES +SLIDESHOW_IMAGE_FILE = file_types.SLIDESHOW_IMAGE +REMOTE_FILE = "remote_file" + + +INPUT_QUESTION = exercises.INPUT_QUESTION +MULTIPLE_SELECTION = exercises.MULTIPLE_SELECTION +SINGLE_SELECTION = exercises.SINGLE_SELECTION +FREE_RESPONSE = exercises.FREE_RESPONSE +PERSEUS_QUESTION = exercises.PERSEUS_QUESTION + + +PRADIGI_CHANNEL_ID = "e832106c639854e181616015a8b87910" + # SOURCE WEBSITES ################################################################################ PRADIGI_LANG_URL_MAP = { @@ -400,7 +442,7 @@ def get_subtree_by_subject_en(lang, subject): if lang not in PRADIGI_LANG_URL_MAP: raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP') - wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang) + wrt_filename = 'chefdata/vader/trees/pradigi_{}_web_resource_tree.json'.format(lang) with open(wrt_filename) as jsonfile: web_resource_tree = json.load(jsonfile) subject_subtrees = web_resource_tree['children'] @@ -425,7 +467,7 @@ def get_subtree_by_source_id(lang, source_id): """ if lang not in PRADIGI_LANG_URL_MAP: raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP') - wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang) + wrt_filename = 'chefdata/vader/trees/pradigi_{}_web_resource_tree.json'.format(lang) with open(wrt_filename) as jsonfile: web_resource_tree = 
json.load(jsonfile) # setup recusive find function @@ -450,13 +492,15 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): """ kind = tree['kind'] if kind in ['topic_page', 'subtopic_page', 'lesson_page', 'fun_page', 'story_page', 'special_subtopic_page']: - thumbnail = tree['thumbnail_url'] if 'thumbnail_url' in tree else None + thumbnail = None + + topic_node = dict( kind=content_kinds.TOPIC, source_id=tree['source_id'], language=lang, title=tree['title'], # or could get from Strings based on subject_en... - description='source_id=' + tree['source_id'] if DEBUG_MODE else '', + description='', #source_id=' + tree['source_id'] if DEBUG_MODE else '', thumbnail=thumbnail, license=PRADIGI_LICENSE, children=[], @@ -485,7 +529,7 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): source_id=tree['source_id'], language=lang, title=tree['title'], - description=tree.get('description', ''), + description='', thumbnail=thumbnail, license=PRADIGI_LICENSE, files=[], @@ -506,22 +550,26 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): elif kind == 'PrathamZipResource': if should_skip_file(tree['url']): return None # Skip games marked with the `SKIP GAME` correction actions - thumbnail = tree['thumbnail_url'] if 'thumbnail_url' in tree else None + + thumbnail = None # tree['thumbnail_url'] if 'thumbnail_url' in tree else None + html5_node = dict( kind=content_kinds.HTML5, source_id=tree['source_id'], language=lang, title=tree['title'], - description=tree.get('description', ''), + description='', #tree.get('description', ''), thumbnail=thumbnail, license=PRADIGI_LICENSE, files=[], ) + if 'phet.zip' in tree['url']: zip_tmp_path = get_phet_zip_file(tree['url'], tree['main_file']) else: zip_tmp_path = get_zip_file(tree['url'], tree['main_file']) if zip_tmp_path is None: + FAIL_FILES.append(tree['url']) raise ValueError('Could not get zip file from %s' % tree['url']) html5zip_file = dict( file_type=file_types.HTML5, @@ -538,7 +586,7 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): source_id=tree['source_id'], language=lang, title=tree['title'], - description=tree.get('description', ''), + description='', #tree.get('description', ''), thumbnail=thumbnail, license=PRADIGI_LICENSE, files=[], @@ -586,7 +634,7 @@ def find_games_for_lang(name, lang, take_from=None): language_en = PRADIGI_STRINGS[lang]['language_en'] # ??? 
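+    # ^ language_en looks unused in this function body; kept verbatim from the
+    #   legacy chef, which this patch retains for review continuity only.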
# load website game web resource data - WEBSITE_GAMES_OUTPUT = 'chefdata/trees/website_games_all_langs.json' + WEBSITE_GAMES_OUTPUT = 'chefdata/vader/trees/website_games_all_langs.json' website_data = json.load(open(WEBSITE_GAMES_OUTPUT, 'r')) if lang in website_data: website_data_lang = website_data[lang] @@ -611,7 +659,6 @@ def find_games_for_lang(name, lang, take_from=None): if len(games) == 0: pass - # print('game', name, 'not found for lang', lang) return games @@ -629,9 +676,9 @@ def website_game_webresouce_to_ricecooker_node(lang, web_resource): source_id=web_resource['source_id'], language=lang, title=web_resource['title'], - description='source_url=' + web_resource['url'] if DEBUG_MODE else '', + description='', license=PRADIGI_LICENSE, - thumbnail=web_resource.get('thumbnail_url'), + thumbnail=None,# web_resource.get('thumbnail_url'), files=[], ) zip_tmp_path = get_zip_file(web_resource['url'], web_resource['main_file']) @@ -642,6 +689,7 @@ def website_game_webresouce_to_ricecooker_node(lang, web_resource): language=lang, ) game_node['files'].append(zip_file) + game_node['md5hash'] = zip_tmp_path.split('/')[-1] LOGGER.debug('Created HTML5AppNode for game ' + web_resource['title']) return game_node else: @@ -692,7 +740,7 @@ def extract_website_games_from_tree(lang): if lang not in PRADIGI_LANG_URL_MAP: raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP') # READ IN - wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang) + wrt_filename = 'chefdata/vader/trees/pradigi_{}_web_resource_tree.json'.format(lang) with open(wrt_filename) as jsonfile: web_resource_tree = json.load(jsonfile) # PROCESS @@ -713,7 +761,6 @@ def recursive_extract_website_games(subtree): child_url = child_url.replace('https://www.prathamopenschool.org/CourseContent/Games/', '') child_url = child_url.replace('http://www.prathamopenschool.org/CourseContent/Games/', '') child['title_en'] = child_url.replace('.zip', '') - print('EXTRACTED game name', child['title_en'], 'form url', child['url']) website_games.append(child) else: # leave other games where they are @@ -729,10 +776,400 @@ def recursive_extract_website_games(subtree): recursive_extract_website_games(web_resource_tree) return website_games +# use local file id to get storage file +def zip_storage_path(filename): + return f"storage/{filename[0]}/{filename[1]}/{filename}.zip" + +_SQLITE_PATH = f"{PRADIGI_CHANNEL_ID}.sqlite3" +if os.path.exists(_SQLITE_PATH): + conn = dbconnect(_SQLITE_PATH) + FILES = get_files(conn) + REMOTE_NODES = get_nodes_for_remote_files(PRADIGI_CHANNEL_ID) +else: + LOGGER.warning( + "Channel DB %s not found; RemoteNode thumbnails will not be populated." 
+ % _SQLITE_PATH + ) + conn = None + FILES = [] + REMOTE_NODES = {} +LOCAL_FILES = dict() + +def _set_thumbnail_file(self, preset): + content_node_id = self.get_node_id().hex + remote_node_files = REMOTE_NODES.get(content_node_id, {}).get('files', []) + for file in remote_node_files: + if file['preset'] == preset: + remote_file = RemoteFile( + file["checksum"], + file["extension"], + file["preset"], + is_primary=False + ) + self.add_file(remote_file) + self.thumbnail = remote_file + + + + +nodes.TopicNode._set_thumbnail_file = _set_thumbnail_file +nodes.HTML5AppNode._set_thumbnail_file = _set_thumbnail_file +nodes.StudioContentNode._set_thumbnail_file = _set_thumbnail_file + + +# Shim ricecooker >=0.8 HTML5 validation: upstream assumes body.text is always +# a string, but BeautifulSoup returns None when has only child elements +# (no direct text node). Replace with a None-safe version and log offending +# paths so Section 3 can investigate. +from ricecooker.utils.pipeline.convert import HTML5ConversionHandler +from ricecooker.utils.pipeline.exceptions import InvalidFileException +import html5lib +from html5lib.html5parser import ParseError + + +def _validate_archive_safe(self, path: str): + with self.open_and_verify_archive(path) as zf: + index_html = self.read_file_from_archive(zf, "index.html") + try: + dom = html5lib.parse(index_html, namespaceHTMLElements=False) + body = dom.find("body") + if body is None: + raise InvalidFileException( + f"File {path} is not a valid HTML5 file, index.html is missing a body element." + ) + body_children = [ + c for c in body.iter() + if isinstance(c.tag, str) and c.tag != "body" + ] + body_text = (body.text or "").strip() + if not body_text and not body_children: + raise InvalidFileException( + f"File {path} is not a valid HTML5 file, index.html is empty." + ) + if body.text is None and not body_text: + LOGGER.info("html5-shim: body.text was None but had %d children in %s" % (len(body_children), path)) + except ParseError: + raise InvalidFileException( + f"File {path} is not a valid HTML5 file, index.html is not well-formed." + ) + + +HTML5ConversionHandler.validate_archive = _validate_archive_safe + +def build_tree_from_json(parent_node, sourcetree): + """ + Recusively parse nodes in the list `sourcetree` and add them as children + to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`. + """ + EXPECTED_NODE_TYPES = [ + TOPIC_NODE, + VIDEO_NODE, + AUDIO_NODE, + EXERCISE_NODE, + DOCUMENT_NODE, + HTML5_NODE, + SLIDESHOW_NODE, + ] + """ + workout the node ID from the source_id -- node_id is based on tree position & source_id + + Allow a JSON tree to accept a RemoteNode (aka StudioContentNode) + if kind == REMOTE_NODE: + child_node = nodes.RemoteNode( + source_id=source_node["source_id"], + ) + + child_node.get_node_id() # returns uid, call .hex? 
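+
+    Sketch of what the branches below do instead (the StudioContentNode
+    signature matches its uses later in this function):
+
+        parent_node.add_child(child_node)        # placement determines node_id
+        node_id = child_node.get_node_id().hex   # .hex is the str form used below
+        removed = parent_node.children.pop()     # swap out the local copy
+        parent_node.add_child(nodes.StudioContentNode(
+            PRADIGI_CHANNEL_ID,
+            source_node_id=node_id,
+            source_content_id=removed.get_content_id().hex,
+            title=removed.title,
+        ))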
+ """ + print("building tree from json") + + for source_node in sourcetree: + kind = source_node["kind"] + if kind not in EXPECTED_NODE_TYPES: + LOGGER.critical("Unexpected node kind found: " + kind) + raise NotImplementedError("Unexpected node kind found in json data.") + + if kind == TOPIC_NODE: + child_node = nodes.TopicNode( + source_id=source_node.get("source_id", None), + title=source_node["title"], + description=source_node.get("description"), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + # no role for topics (computed dynaically from descendants) + language=source_node.get("language"), + #thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), + ) + parent_node.add_child(child_node) + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == child_node.get_node_id().hex: + for j in range(len(parent_node.children[i].files)): + if parent_node.children[i].files[j].preset == format_presets.TOPIC_THUMBNAIL: + removed = parent_node.children[i].files.pop(j) + logging.info("Removed thumbnail file from {}".format(parent_node.children[i].title)) + parent_node.children[i]._set_thumbnail_file(format_presets.TOPIC_THUMBNAIL) + source_tree_children = source_node.get("children", []) + build_tree_from_json(child_node, source_tree_children) + + elif kind == VIDEO_NODE: + child_node = nodes.VideoNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), + ) + + add_files(child_node, source_node.get("files") or []) + parent_node.add_child(child_node) + node_id = child_node.get_node_id().hex + + # But, remove the child if it's the same ID so we can replace it w/ the remote node + removed = None + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == node_id: + removed = parent_node.children.pop(i) + break + # use remote node if something was removed, if nothing was removed, we're done + if removed: + remote_node = nodes.StudioContentNode( + PRADIGI_CHANNEL_ID, + source_node_id=node_id, + source_content_id=removed.get_content_id().hex, + title=source_node["title"], + description='', + ) + remote_node.source_id = source_node["source_id"] + parent_node.add_child(remote_node) + #node_id = child_node.get_node_id().hex + + #add_files(child_node, source_node.get("files") or []) + #for i in range(len(parent_node.children)): + #if parent_node.children[i].get_node_id().hex == node_id: + #parent_node.children.pop(i) + #break + # figure out how to remove the child - parent_node.children.pop() + # hard code channel_id to make it work on my staging channel + #child_node = nodes.StudioContentNode(PRADIGI_CHANNEL_ID, source_node_id=node_id, title=source_node["title"]) + #import ipdb;ipdb.set_trace() + #child_node.source_id = source_node["source_id"] + #parent_node.add_child(child_node) + + elif kind == AUDIO_NODE: + child_node = nodes.AudioNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + 
license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), + ) + add_files(child_node, source_node.get("files") or []) + parent_node.add_child(child_node) + node_id = child_node.get_node_id().hex + + # But, remove the child if it's the same ID so we can replace it w/ the remote node + removed = None + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == node_id: + removed = parent_node.children.pop(i) + break + # use remote node if something was removed, if nothing was removed, we're done + if removed: + remote_node = nodes.StudioContentNode( + PRADIGI_CHANNEL_ID, + source_node_id=node_id, + source_content_id=removed.get_content_id().hex, + title=source_node["title"], + description='', + ) + remote_node.source_id = source_node["source_id"] + parent_node.add_child(remote_node) + + elif kind == EXERCISE_NODE: + child_node = nodes.ExerciseNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get( + "derive_thumbnail", False + ), # not supported yet + tags=source_node.get("tags"), + exercise_data=source_node.get("exercise_data"), + questions=[], + ) + add_questions(child_node, source_node.get("questions") or []) + parent_node.add_child(child_node) + node_id = child_node.get_node_id().hex + + # But, remove the child if it's the same ID so we can replace it w/ the remote node + removed = None + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == node_id: + removed = parent_node.children.pop(i) + break + # use remote node if something was removed, if nothing was removed, we're done + if removed: + remote_node = nodes.StudioContentNode( + PRADIGI_CHANNEL_ID, + source_node_id=node_id, + source_content_id=removed.get_content_id().hex, + title=source_node["title"], + description='', + ) + remote_node.source_id = source_node["source_id"] + parent_node.add_child(remote_node) + + elif kind == DOCUMENT_NODE: + child_node = nodes.DocumentNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + tags=source_node.get("tags"), + ) + # Things are just normal around here... 
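+        # (Same pattern as the other kinds: add the node so ricecooker
+        #  computes its tree-position node_id, then pop it and re-add a
+        #  StudioContentNode that points at the already-published copy,
+        #  so the document file itself is never re-uploaded.)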
+            add_files(child_node, source_node.get("files") or [])
+            parent_node.add_child(child_node)
+            node_id = child_node.get_node_id().hex
+
+            # Swap the freshly added child for a StudioContentNode reference.
+            removed = None
+            for i in range(len(parent_node.children)):
+                if parent_node.children[i].get_node_id().hex == node_id:
+                    removed = parent_node.children.pop(i)
+                    break
+            if removed:
+                remote_node = nodes.StudioContentNode(
+                    PRADIGI_CHANNEL_ID,
+                    source_node_id=node_id,
+                    source_content_id=removed.get_content_id().hex,
+                    title=source_node["title"],
+                    description='',
+                )
+                remote_node.source_id = source_node["source_id"]
+                parent_node.add_child(remote_node)
+
+        elif kind == HTML5_NODE:
+            child_node = nodes.HTML5AppNode(
+                source_id=source_node["source_id"],
+                title=source_node["title"],
+                description='',
+                license=get_license(**source_node["license"]),
+                author=source_node.get("author"),
+                aggregator=source_node.get("aggregator"),
+                provider=source_node.get("provider"),
+                role=source_node.get("role", roles.LEARNER),
+                language=source_node.get("language"),
+                # thumbnail=source_node.get("thumbnail"),
+                # derive_thumbnail=source_node.get("derive_thumbnail", False),
+                tags=source_node.get("tags"),
+            )
+
+            # Separate out the html5 zip so it is added last, after any
+            # thumbnails or other supplementary files.
+            zip_file = [z for z in source_node.get("files", []) if z["file_type"] == "html5"][0]
+            files_to_add = [f for f in source_node.get("files") if f["file_type"] != "html5"]
+            files_to_add.append(zip_file)
+            add_files(child_node, files_to_add)
+
+            parent_node.add_child(child_node)
+
+            # NOTE: html5 nodes are deliberately NOT swapped for remote
+            # references -- their zips must be re-uploaded with the Android
+            # mobileDeviceFlag fix applied.
+
+        elif kind == SLIDESHOW_NODE:
+            child_node = nodes.SlideshowNode(
+                source_id=source_node["source_id"],
+                title=source_node["title"],
+                description='',
+                license=get_license(**source_node["license"]),
+                author=source_node.get("author"),
+                aggregator=source_node.get("aggregator"),
+                provider=source_node.get("provider"),
+                role=source_node.get("role", roles.LEARNER),
+                language=source_node.get("language"),
+                thumbnail=source_node.get("thumbnail"),
+                derive_thumbnail=source_node.get("derive_thumbnail", False),
+                tags=source_node.get("tags"),
+            )
+            add_files(child_node, source_node.get("files") or [])
+            parent_node.add_child(child_node)
+            node_id = child_node.get_node_id().hex
+
+            # Swap the freshly added child for a StudioContentNode reference.
+            removed = None
+            for i in range(len(parent_node.children)):
+                if parent_node.children[i].get_node_id().hex == node_id:
+                    removed = parent_node.children.pop(i)
+                    break
+            if removed:
+                remote_node = nodes.StudioContentNode(
+                    PRADIGI_CHANNEL_ID,
+                    source_node_id=node_id,
+                    source_content_id=removed.get_content_id().hex,
+                    title=source_node["title"],
+                    description='',
+                )
+                remote_node.source_id = source_node["source_id"]
+                parent_node.add_child(remote_node)
+
+        # TODO: add support for H5P content kind
+        else:
+            LOGGER.critical("Encountered an unknown kind: " + str(source_node))
+            continue
+    return parent_node
@@ -743,7 +1180,7 @@ class PraDigiChef(JsonTreeChef):
     """
     SushiChef script for importing and merging the content from these sources:
       - Video, PDFs, and interactive demos from http://www.prathamopenschool.org/
-      - Games from http://www.prathamopenschool.org/
+      - Games from http://www.prathamopenschool.org/
     """
     RICECOOKER_JSON_TREE = 'pradigi_ricecooker_json_tree.json'
@@ -753,7 +1190,7 @@ def crawl(self, args, options):
         Crawl website and save web resource trees in chefdata/trees/.
         """
         from pradigi_crawlers import PraDigiCrawler
-
+
         # website
         for lang in PRADIGI_WEBSITE_LANGUAGES:
             website_crawler = PraDigiCrawler(lang=lang)
@@ -764,15 +1201,27 @@
         for lang in PRADIGI_WEBSITE_LANGUAGES:
             lang_games = extract_website_games_from_tree(lang)
             website_games[lang] = lang_games
-        WEBSITE_GAMES_OUTPUT = 'chefdata/trees/website_games_all_langs.json'
+        WEBSITE_GAMES_OUTPUT = 'chefdata/vader/trees/website_games_all_langs.json'
         # Save website games
         with open(WEBSITE_GAMES_OUTPUT, 'w') as json_file:
             json.dump(website_games, json_file, ensure_ascii=False, indent=2, sort_keys=True)
 
+    def construct_channel(self, **kwargs):
+        """
+        Build the channel tree by adding TopicNodes and ContentNode children.
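+
+        Previously published leaves come back from the vendored
+        build_tree_from_json as StudioContentNode references, so only the
+        HTML5 apps (which need the Android fix) are uploaded with files.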
+ """ + channel = self.get_channel(**kwargs) + json_tree_path = self.get_json_tree_path(**kwargs) + json_tree = read_tree_from_json(json_tree_path) + build_tree_from_json(channel, json_tree["children"]) + ## Create video node, get node ID, create RemoteNode in its place + ## vendor build_tree_from_json -- know vidoe & doc nodes need to be remove nodes + raise_for_invalid_channel(channel) + return channel def build_subtree_for_lang(self, lang): LOGGER.info('Building subtree for lang {}'.format(lang)) - + lang_subtree = copy.deepcopy(TEMPLATE_FOR_LANG) lang_obj = getlang(lang) language_en = PRADIGI_STRINGS[lang]['language_en'] @@ -874,9 +1323,10 @@ def build_subtree_for_lang(self, lang): # TODO: check for empty sub-folders too age_groups_subtree['children'] = nonempty_subject_subtrees - # Special handling for '3-6 years' children: - # Replace the contents of the first child (KhelBadi) with its children - # then append the any remaining nodes in this age group + # Flatten the '3-6 years' agre group to contain contents of KhelBadi + # if age_groups_subtree['title'] == '3-6 years' and len(age_groups_subtree['children']) == 1: + # khelbadi_subtree = age_groups_subtree['children'][0] + # age_groups_subtree['children'] = khelbadi_subtree['children'] if age_groups_subtree['title'] == '3-6 years' and age_groups_subtree['children']: new_children = [] khelbadi_subtree = age_groups_subtree['children'][0] @@ -884,6 +1334,11 @@ def build_subtree_for_lang(self, lang): other_subtrees = age_groups_subtree['children'][1:] new_children.extend(other_subtrees) age_groups_subtree['children'] = new_children + # flat_subfolders = [] + # for folder in age_groups_subtree['children']: + # for subfolder in folder['children']: + # flat_subfolders.append(subfolder) + # age_groups_subtree['children'] = flat_subfolders return lang_subtree @@ -895,7 +1350,7 @@ def pre_run(self, args, options): LOGGER.info('in pre_run...') # delete .zip files in temporary dir when running using update - if args['update']: + if False or args['update']: LOGGER.info('Deleting all zips in cache dir {}'.format(HTML5APP_ZIPS_LOCAL_DIR)) for rel_path in os.listdir(HTML5APP_ZIPS_LOCAL_DIR): abs_path = os.path.join(HTML5APP_ZIPS_LOCAL_DIR, rel_path) @@ -903,12 +1358,12 @@ def pre_run(self, args, options): shutil.rmtree(abs_path) # option to skip crawling stage - if 'nocrawl' not in options: - self.crawl(args, options) + #if 'nocrawl' not in options: + #self.crawl(args, options) # Conditionally determine `source_id` depending on variant specified if 'variant' in options and options['variant'].upper() == 'LE': - # Official PraDigi channel = + # Official PraDigi channel = channel_name = 'PraDigi' channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE DEBUG_MODE = False diff --git a/pradigi/transform.py b/pradigi/transform.py index e1fa47af..4e5b2aee 100644 --- a/pradigi/transform.py +++ b/pradigi/transform.py @@ -72,134 +72,48 @@ def get_zip_file(zip_file_url, main_file): THe `main_file` needs to be renamed to index.html to make it compatible with Kolibri. 
""" key = zip_file_url + main_file - destpath = make_temporary_dir_from_key(key) - - # Check for "REPLACE WITH:" correction rule for the current `zip_file_url` - replacement_url = should_replace_with(zip_file_url) - if replacement_url: - zip_file_url = replacement_url - - # return cached version if already there + key_bytes = key.encode('utf-8') + m = hashlib.md5() + m.update(key_bytes) + subdir = m.hexdigest() + destpath = os.path.join("chefdata/zipfiles/", subdir) final_webroot_path = os.path.join(destpath, 'webroot.zip') if os.path.exists(final_webroot_path): - return final_webroot_path + # Existence isn't enough: we've seen 22-byte empty zips in the cache + # that pass this check but fail later at ricecooker validation with a + # confusing "No required format preset" error. Verify the zip has at + # least one entry before returning its path. + try: + with zipfile.ZipFile(final_webroot_path) as zf: + names = zf.namelist() + if not names: + LOGGER.error( + "get_zip_file: %s exists but is an EMPTY zip (url=%s)" + % (final_webroot_path, zip_file_url) + ) + elif "index.html" not in names: + LOGGER.error( + "get_zip_file: %s has no index.html at root (url=%s). " + "top entries: %s" + % (final_webroot_path, zip_file_url, names[:3]) + ) + else: + return final_webroot_path + except zipfile.BadZipFile as e: + LOGGER.error( + "get_zip_file: %s exists but is NOT a valid zip: %s (url=%s)" + % (final_webroot_path, e, zip_file_url) + ) else: - LOGGER.error("Now we need local files so we can process them: %s" % final_webroot_path) - - try: - download_file(zip_file_url, destpath, request_fn=make_request) - - zip_filename = zip_file_url.split('/')[-1] # e.g. Mathematics.zip - zip_basename = zip_filename.rsplit('.', 1)[0] # e.g. Mathematics/ - - # July 31: handle ednge cases where zip filename doesn't match folder name inside it - awazchitras = ['Awazchitra_HI', 'Awazchitra_TL', 'Awazchitra_KN', - 'Awazchitra_BN', 'Awazchitra_OD', 'Awazchitra_PN', 'Awazchitra_TM'] - for awazchitra in awazchitras: - if awazchitra in zip_basename: - zip_basename = zip_basename.replace('Awazchitra', 'AwazChitra') - if '_KKS_Hi' in zip_basename: - zip_basename = zip_basename.replace('_KKS_Hi', '_KKS_HI') - - # Mar 2: more edge cases where zip filename doesn't match folder name inside it - if 'Memorygamekb' in zip_basename: - zip_basename = zip_basename.replace('Memorygamekb', 'MemoryGamekb') - if 'cityofstories' in zip_basename: - zip_basename = zip_basename.replace('cityofstories', 'CityOfStories') - - # Jun 12: fix more edge cases where .zip filename doesn't match dir name - if '_KKS_Gj' in zip_basename: - zip_basename = zip_basename.replace('_KKS_Gj', '_KKS_GJ') - if 'ShabdKhel' in zip_basename: - zip_basename = zip_basename.replace('ShabdKhel', 'Shabdkhel') - - zip_folder = os.path.join(destpath, zip_basename) # e.g. destpath/Mathematics/ - main_file = main_file.split('/')[-1] # e.g. activity_name.html or index.html - - if 'KhelbadiKahaniyan_MR' in zip_basename: - # Inconsistency --- `main_file` contains dir name, and not index.html - main_file = 'index.html' - - # Jul 8th: handle weird case-insensitive webserver main_file - if main_file == 'mainexpand.html': - main_file = 'mainExpand.html' # <-- this is the actual filename in the zip - - # Zip files from Pratham website have the web content inside subfolder - # of the same as the zip filename. We need to recreate these zip files - # to make sure the index.html is in the root of the zip. 
-        local_zip_file = os.path.join(destpath, zip_filename)
-        with zipfile.ZipFile(local_zip_file) as zf:
-            # If main_file is in the root (like zips from the game repository)
-            # then we need to extract the zip contents to subfolder zip_basename/
-            for zfileinfo in zf.filelist:
-                if zfileinfo.filename == main_file:
-                    destpath = os.path.join(destpath, zip_basename)
-            # Extract zip so main file will be in destpath/zip_basename/index.html
-            zf.extractall(destpath)
-
-        # In some cases, the files are under the www directory,
-        # let's move them up one level.
-        www_dir = os.path.join(zip_folder, 'www')
-        if os.path.isdir(www_dir):
-            files = os.listdir(www_dir)
-            for f in files:
-                shutil.move(os.path.join(www_dir, f), zip_folder)
-
-        # Rename `main_file` to index.html
-        src = os.path.join(zip_folder, main_file)
-        dest = os.path.join(zip_folder, 'index.html')
-        os.rename(src, dest)
-
-        # Logic to add margin-top:44px; for games that match Corrections tab
-        add_margin_top = False
-        for row in PRADIGI_CORRECTIONS_LIST:
-            if row[CORRECTIONS_ACTION_KEY] == ADD_MARGIN_TOP_ACTION:
-                pat = row[CORRECTIONS_SOURCE_URL_PAT_KEY]
-                m = pat.match(zip_file_url)
-                if m:
-                    add_margin_top = True
-        if add_margin_top:
-            if zip_file_url.endswith('CourseContent/Games/Mathematics.zip'):
-                LOGGER.info("adding body.margin-top:44px; to ALL .html files in: %s" % zip_file_url)
-                for root, dirs, files in os.walk(zip_folder):
-                    for file in files:
-                        if file.endswith(".html"):
-                            add_body_margin_top(root, file)
-            else:
-                LOGGER.info("adding body.margin-top:44px; to index.html in: %s" % zip_file_url)
-                add_body_margin_top(zip_folder, 'index.html')
-
-        # Replace occurences of `main_file` with index.html to avoid broken links
-        for root, dirs, files in os.walk(zip_folder):
-            for file in files:
-                if file.endswith(".html") or file.endswith(".js"):
-                    file_path = os.path.join(root, file)
-                    # use bytes to avoid Unicode errors "invalid start/continuation byte"
-                    bytes_in = open(file_path, 'rb').read()
-                    bytes_out = bytes_in.replace(main_file.encode('utf-8'), b'index.html')
-                    open(file_path, 'wb').write(bytes_out)
-
-        for root, dirs, files in os.walk(zip_folder):
-            for file in files:
-                if file.endswith(".js"):
-                    LOGGER.info("Fixing Android bug in JS file: %s" % file)
-                    with open(file, 'w') as f:
-                        content = f.read()
-                        content = content.replace(
-                            'Utils.mobileDeviceFlag=true',
-                            'Utils.mobileDeviceFlag=false'
-                        )
-                        f.write(content)
-                        f.close()
-        # create the zip file and copy it to 
-        tmp_predictable_zip_path = create_predictable_zip(zip_folder)
-        shutil.copyfile(tmp_predictable_zip_path, final_webroot_path)
-        return final_webroot_path
-
-    except Exception as e:
-        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
-                     (zip_file_url, main_file, destpath, e))
-        return None
+        LOGGER.error("get_zip_file: no webroot.zip at %s (url=%s, main_file=%s)"
+                     % (final_webroot_path, zip_file_url, main_file))
+        if os.path.isdir(destpath):
+            for root, dirs, files in os.walk(destpath):
+                if files:
+                    LOGGER.error(" dir contents: %s" % files)
+        else:
+            LOGGER.error(" hash dir does not exist at all")
+    return None
 
 
 PHET_INDEX_HTML_TEMPLATE = """
diff --git a/pradigi/zipfix.py b/pradigi/zipfix.py
new file mode 100644
index 00000000..e00cef81
--- /dev/null
+++ b/pradigi/zipfix.py
@@ -0,0 +1,113 @@
+import os
+import shutil
+import tempfile
+import zipfile
+import ntpath
+import logging
+from ricecooker.utils.zip import create_predictable_zip
+
+logging.basicConfig(level=logging.INFO, filename="zipfix.log")
+
+LIKELY_OFFENDING = "Utils.mobileDeviceFlag"
+OFFENDING_CODE = "Utils.mobileDeviceFlag=true"
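+# The offending assignment appears verbatim in Pratham's bundled app JS, so an
+# exact-string replace is sufficient; no parsing or regex needed.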
"Utils.mobileDeviceFlag=true" +FIXED_CODE = "Utils.mobileDeviceFlag=false" + +def apply_fix_better(filepath, dryrun=False): + """ + Traverse files and replace offending code with fixed code, handling Unicode issues. + + Args: + filepath (str): Path to the directory or file. + dryrun (bool): If True, do not modify files. + """ + for root, dirs, files in os.walk(filepath): + for file in files: + if file.endswith((".js", ".html")): + file_path = os.path.join(root, file) + content = None + + # Attempt to read the file with utf-8 encoding + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Handle UnicodeDecodeError by trying an alternate encoding + except UnicodeDecodeError: + logging.warning("UTF-8 decoding failed for %s. Trying unicode_escape.", file_path) + try: + with open(file_path, "r", encoding="unicode_escape") as f: + content = f.read() + except UnicodeDecodeError: + logging.error("Cannot decode %s with either UTF-8 or unicode_escape. Skipping.", file_path) + continue + + # If content was successfully read, perform the fix + if content: + #logging.info("First chars of {}: {}".format(file_path, content[0:100])) + + # Fix bytestring corruption: files that start with b' and end with ' + if content.startswith("b'") and len(content) > 2: + logging.info("Fixing bytestring corruption in %s", file_path) + # Strip b' prefix + content = content[2:] + # Strip trailing ' if present + if content.endswith("'"): + content = content[:-1] + # Decode escape sequences like \r\n, \t, etc. + try: + content = content.encode('utf-8').decode('unicode_escape') + except UnicodeDecodeError: + logging.error("Failed to decode escape sequences in %s", file_path) + + if FIXED_CODE in content: + logging.info("Previously fixed: %s", file_path) + + if OFFENDING_CODE in content: + logging.info("Fixing mobile device flag in %s", file_path) + content = content.replace(OFFENDING_CODE, FIXED_CODE) + + if not dryrun: + with open(file_path, "w", encoding="utf-8") as f: + logging.info("Writing back... {}".format(file_path)) + f.write(content) + # No need for manual recursion - os.walk already handles subdirectories + +def fix_zips(path, dryrun=False): + """ + Process a zip file, extract, fix offending code, and repackage. + + Args: + path (str): Path to the zip file. + dryrun (bool): If True, do not modify files. + """ + zip_filename = ntpath.basename(path) + extract_path = tempfile.mkdtemp() + + + try: + # Extract zip file contents + with zipfile.ZipFile(path) as zf: + zf.extractall(extract_path) + logging.info("Extracted %s to %s", path, extract_path) + + # Apply fixes to the extracted files + apply_fix_better(extract_path, dryrun) + + except zipfile.BadZipFile: + logging.error("%s is not a valid zip file. Skipping.", zip_filename) + + finally: + # Ensure cleanup of the temporary directory + tmp_predictable_zip_path = create_predictable_zip(extract_path) + shutil.copyfile(tmp_predictable_zip_path, path) + shutil.rmtree(extract_path) + os.remove(tmp_predictable_zip_path) + logging.info("Cleaned up temporary directory %s", extract_path) + +# Main loop to process all zip files in the target directory +for root, dirs, files in os.walk('./chefdata/zipfiles/'): + for name in files: + if name.endswith('.zip'): + path = os.path.join(root, name) + logging.info("Processing zip file: %s", path) + fix_zips(path, dryrun=False) # Set dryrun=True for testin