From ad6c2f9ff054f7a3c91bc134faaa74f7520a3550 Mon Sep 17 00:00:00 2001
From: Jacob Pierce
Date: Wed, 6 May 2026 09:37:06 -0700
Subject: [PATCH] PraDigi: Android HTML5 fix + fresh chef from imported Kolibri DB

The PraDigi channel's HTML5 apps fail to load on Android because their JS
sets `Utils.mobileDeviceFlag=true`, which makes the apps call into an
Android-only native bridge that isn't always present. This change ships the
fix and rebuilds the channel from a trustworthy source.

The previous chef (`sushichef.py` + `transform.py` + `pradigi_crawlers.py` +
`structure.py`) couldn't produce a clean rebuild: the crawl JSONs at
`chefdata/vader/trees/` are stale (3,443 videos vs the live channel's 4,686),
the local zip caches contain partial, empty, or index-less zips that pass
`os.path.exists()` but fail HTML5 validation, and the prior in-tree Android
fix in `transform.py` opened files in `'w'` mode and then read from them,
corrupting at least one zip with a `b'...'` bytestring signature.

`fresh_chef.py` walks the imported Kolibri channel DB
(`.kolibri/content/databases/e832106c639854e181616015a8b87910.sqlite3`) and
emits topics, HTML5 apps, videos, and documents directly from local storage.
`content_id` is preserved from the DB so node_ids stay stable and Kolibri
user progress carries through. The Android fix is applied inline in
`prepare_html5_zip(checksum)` and only re-zips when the source contains
`Utils.mobileDeviceFlag=true`; unchanged zips pass through with their
checksum intact, skipping re-upload entirely. A monkey-patch shims
ricecooker 0.8.0's `HTML5ConversionHandler.validate_archive`, which crashes
when the parsed index.html has a `<body>` whose `.text` is None (html5lib
returns None when the body holds only child elements).

`sushichef.py` and `transform.py` are no longer used by the new chef but are
kept here for review continuity (and for git blame); they are safe to delete
in a follow-up. Bugs that surfaced during recovery (a NameError in
`build_tree_from_json`, a broken android-fix block) are also fixed.

Helper scripts:
- `scripts/verify_zipfix.py` spot-checks that zipfix has been applied.
- `scripts/scan_all_zips.py` does a full integrity scan of
  `chefdata/zipfiles/` for empty/missing/no-index zips.
- `scripts/html5_test_server.py` is a tiny LAN server that lists every HTML5
  app and serves the fixed zips for testing on a real Android device.

`zipfix.py` is restored from the prior maintainer's stash; it is a one-shot
Android-fix script for the legacy `chefdata/zipfiles/` layout. The new chef
applies the fix inline, so this script is kept for reference only.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pradigi/fresh_chef.py                | 369 +++++++++++++++++++
 pradigi/kolibridb.py                 | 439 ++++++++++++++++++++++
 pradigi/scripts/html5_test_server.py | 158 ++++++++
 pradigi/scripts/scan_all_zips.py     |  79 ++++
 pradigi/scripts/verify_zipfix.py     |  88 +++++
 pradigi/sushichef.py                 | 519 +++++++++++++++++++++++++--
 pradigi/transform.py                 | 164 ++-------
 pradigi/zipfix.py                    | 113 ++++++
 8 files changed, 1772 insertions(+), 157 deletions(-)
 create mode 100644 pradigi/fresh_chef.py
 create mode 100644 pradigi/kolibridb.py
 create mode 100644 pradigi/scripts/html5_test_server.py
 create mode 100644 pradigi/scripts/scan_all_zips.py
 create mode 100644 pradigi/scripts/verify_zipfix.py
 create mode 100644 pradigi/zipfix.py

diff --git a/pradigi/fresh_chef.py b/pradigi/fresh_chef.py
new file mode 100644
index 00000000..5e3c77e6
--- /dev/null
+++ b/pradigi/fresh_chef.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python
+"""
+PraDigi fresh chef — builds the channel directly from an imported Kolibri
+DB snapshot instead of from stale crawl JSONs.
+
+    ./fresh_chef.py -v --token=$STUDIO_PRODUCTION_ADMIN_TOKEN --stage
+
+Required env:
+    STUDIO_URL=$HOTFIXES   (staging Studio for QA; default points at production)
+    KOLIBRI_HOME=          (optional; defaults to ~/.kolibri or the repo's
+                            symlinked .kolibri)
+
+Shape of the work:
+  - topics → TopicNode, with RemoteFile thumbnail if present in DB
+  - html5 → HTML5AppNode uploading the local zip from Kolibri storage with
+    the Android `Utils.mobileDeviceFlag=true` → `=false` fix applied
+    (only re-zipped when an actual change was made, so unchanged zips
+    keep their checksum and Studio dedupes the upload)
+  - other (video / audio / document / exercise / slideshow) → StudioContentNode
+    referencing the same PraDigi channel on Studio; both
+    source_node_id AND source_content_id are sent so Studio resolves
+    even if node_ids drift
+  - content_ids are preserved one-for-one with the source DB, so existing
+    Kolibri users keep their progress across this republish
+"""
+import hashlib
+import logging
+import os
+import shutil
+import sqlite3
+import tempfile
+import uuid
+import zipfile
+
+import html5lib
+from html5lib.html5parser import ParseError
+from le_utils.constants import licenses
+from ricecooker.chefs import SushiChef
+from ricecooker.classes import files, nodes
+from ricecooker.classes.licenses import get_license
+from ricecooker.config import LOGGER
+from ricecooker.utils.pipeline.convert import HTML5ConversionHandler
+from ricecooker.utils.pipeline.exceptions import InvalidFileException
+from ricecooker.utils.zip import create_predictable_zip
+
+
+# ---------- ricecooker 0.8 HTML5 validate_archive None-safe shim ----------
+# Upstream assumes body.text is always a string; it's None when <body> has
+# only child elements. Without this shim, one bad-shaped index.html crashes
+# the whole ThreadPool in process_files.
+def _validate_archive_safe(self, path):
+    with self.open_and_verify_archive(path) as zf:
+        index_html = self.read_file_from_archive(zf, "index.html")
+        try:
+            dom = html5lib.parse(index_html, namespaceHTMLElements=False)
+            body = dom.find("body")
+            if body is None:
+                raise InvalidFileException(
+                    f"File {path} is not a valid HTML5 file, index.html is missing a body element."
+                )
+            body_children = [
+                c for c in body.iter()
+                if isinstance(c.tag, str) and c.tag != "body"
+            ]
+            if not (body.text or "").strip() and not body_children:
+                raise InvalidFileException(
+                    f"File {path} is not a valid HTML5 file, index.html is empty."
+                )
+        except ParseError:
+            raise InvalidFileException(
+                f"File {path} is not a valid HTML5 file, index.html is not well-formed."
+ ) + + +HTML5ConversionHandler.validate_archive = _validate_archive_safe + + +# ---------------------------------------------------------------- CONFIG +PRADIGI_CHANNEL_ID = "e832106c639854e181616015a8b87910" +PRADIGI_DOMAIN = "prathamopenschool.org" +PRADIGI_SOURCE_ID = "pradigi-channel" + +KOLIBRI_HOME = os.environ.get("KOLIBRI_HOME") or os.path.expanduser( + "/var/home/jacob/LE/repos/content-integration/pradigi/.kolibri" +) +KOLIBRI_DB = os.path.join( + KOLIBRI_HOME, "content", "databases", f"{PRADIGI_CHANNEL_ID}.sqlite3" +) +KOLIBRI_STORAGE = os.path.join(KOLIBRI_HOME, "content", "storage") + +FIXED_ZIPS_DIR = "chefdata/fixed_zips" +OFFENDING = b"Utils.mobileDeviceFlag=true" +FIXED = b"Utils.mobileDeviceFlag=false" + +LICENSE_MAP = { + "CC BY": licenses.CC_BY, + "CC BY-SA": licenses.CC_BY_SA, + "CC BY-ND": licenses.CC_BY_ND, + "CC BY-NC": licenses.CC_BY_NC, + "CC BY-NC-SA": licenses.CC_BY_NC_SA, + "CC BY-NC-ND": licenses.CC_BY_NC_ND, + "All Rights Reserved": licenses.ALL_RIGHTS_RESERVED, + "Public Domain": licenses.PUBLIC_DOMAIN, + "Special Permissions": licenses.SPECIAL_PERMISSIONS, +} +DEFAULT_LICENSE = licenses.CC_BY_NC_SA + +LOGGER.setLevel(logging.INFO) + + +# --------------------------------------------------------------- HELPERS +def dbconn(): + conn = sqlite3.connect(KOLIBRI_DB) + conn.row_factory = sqlite3.Row + return conn + + +def storage_path(checksum, extension): + """Kolibri stores files under storage/{cksum[0]}/{cksum[1]}/{cksum}.{ext}.""" + return os.path.join( + KOLIBRI_STORAGE, checksum[0], checksum[1], f"{checksum}.{extension}" + ) + + +def make_license(license_name, owner): + license_id = LICENSE_MAP.get(license_name, DEFAULT_LICENSE) + holder = owner or "PraDigi" + return get_license(license_id, copyright_holder=holder) + + +def get_node_files(conn, contentnode_id): + return conn.execute( + "SELECT local_file_id, extension, preset FROM content_file " + "WHERE contentnode_id=?", + (contentnode_id,), + ).fetchall() + + +# --------------------------------------- HTML5: Android-fix + predictable zip +def _zip_contains_offending(zip_path): + with zipfile.ZipFile(zip_path) as zf: + for info in zf.infolist(): + if not info.filename.endswith((".js", ".html")): + continue + try: + if OFFENDING in zf.read(info): + return True + except Exception: + continue + return False + + +def prepare_html5_zip(checksum): + """ + Return the path to a local zip file for the given Kolibri storage checksum. + If the original zip contains `Utils.mobileDeviceFlag=true`, extract, replace, + and re-pack via create_predictable_zip, caching the result under + chefdata/fixed_zips/. Otherwise use the storage file directly so its + existing checksum is preserved and Studio dedupes the upload. 
+ """ + src = storage_path(checksum, "zip") + if not os.path.isfile(src): + raise FileNotFoundError(src) + + if not _zip_contains_offending(src): + return src + + os.makedirs(FIXED_ZIPS_DIR, exist_ok=True) + fixed_path = os.path.join(FIXED_ZIPS_DIR, f"{checksum}.zip") + if os.path.isfile(fixed_path): + return fixed_path + + with tempfile.TemporaryDirectory() as tmp: + with zipfile.ZipFile(src) as zf: + zf.extractall(tmp) + for root, _, fnames in os.walk(tmp): + for fname in fnames: + if not fname.endswith((".js", ".html")): + continue + p = os.path.join(root, fname) + try: + data = open(p, "rb").read() + except OSError: + continue + if OFFENDING in data: + open(p, "wb").write(data.replace(OFFENDING, FIXED)) + built = create_predictable_zip(tmp) + shutil.copyfile(built, fixed_path) + os.unlink(built) + return fixed_path + + +# --------------------------------------------------------- tree walk / build +class PraDigiFreshChef(SushiChef): + channel_info = { + "CHANNEL_SOURCE_DOMAIN": PRADIGI_DOMAIN, + "CHANNEL_SOURCE_ID": PRADIGI_SOURCE_ID, + "CHANNEL_TITLE": "PraDigi", + "CHANNEL_LANGUAGE": "mul", + "CHANNEL_THUMBNAIL": "chefdata/prathamlogo_b01-v1.jpg", + "CHANNEL_DESCRIPTION": ( + "Developed by Pratham, these educational games, videos, and ebooks " + "are designed to teach language learning, math, science, English, " + "health, and vocational training in Hindi, Marathi, Odia, Bengali, " + "Urdu, Punjabi, Kannada, Tamil, Telugu, Gujarati and Assamese. " + "Materials are designed for learners of all ages, including those " + "outside the formal classroom setting." + ), + } + + def construct_channel(self, **kwargs): + if not os.path.isfile(KOLIBRI_DB): + raise RuntimeError( + f"Kolibri DB not found at {KOLIBRI_DB}. Set KOLIBRI_HOME or " + f"import the PraDigi channel into a local Kolibri instance." + ) + channel = self.get_channel(**kwargs) + conn = dbconn() + root_id = conn.execute( + "SELECT root_id FROM content_channelmetadata WHERE id=?", + (PRADIGI_CHANNEL_ID,), + ).fetchone()[0] + LOGGER.info(f"walking tree under root {root_id}") + self._build_subtree(channel, root_id, conn) + LOGGER.info("tree built") + return channel + + def _build_subtree(self, parent_node, db_parent_id, conn): + rows = conn.execute( + "SELECT * FROM content_contentnode " + "WHERE parent_id=? AND available=1 " + "ORDER BY sort_order, title", + (db_parent_id,), + ).fetchall() + for row in rows: + child = self._build_one(row, conn) + if child is None: + continue + parent_node.add_child(child) + # Preserve original content_id → preserves node_id cascade → + # Kolibri user progress carries over. + # Also force domain_ns init; to_dict() reads self.domain_ns directly + # and it's normally populated as a side-effect of get_content_id(), + # which we bypass by setting content_id ourselves. + child.get_domain_namespace() + if row["content_id"]: + child.content_id = uuid.UUID(hex=row["content_id"]) + if row["kind"] == "topic": + self._build_subtree(child, row["id"], conn) + + def _build_one(self, row, conn): + kind = row["kind"] + title = row["title"] + lang = row["lang_id"] + description = row["description"] or "" + # Stable source_id: use the DB's content_id. Combined with the content_id + # override in _build_subtree, this keeps the new channel's node_ids + # identical to the source channel's. 
+ source_id = row["content_id"] + + node_files = get_node_files(conn, row["id"]) + + if kind == "topic": + topic = nodes.TopicNode( + source_id=source_id, + title=title, + language=lang, + description=description, + ) + self._attach_local_thumbnail(topic, node_files) + return topic + + license_obj = make_license(row["license_name"], row["license_owner"]) + + if kind == "html5": + zip_row = next((f for f in node_files if f["extension"] == "zip"), None) + if zip_row is None: + LOGGER.warning(f"html5 node has no zip, skipping: {title}") + return None + try: + zip_path = prepare_html5_zip(zip_row["local_file_id"]) + except Exception as e: + LOGGER.error( + f"failed to prepare zip {zip_row['local_file_id']} for {title}: {e}" + ) + return None + node = nodes.HTML5AppNode( + source_id=source_id, + title=title, + license=license_obj, + language=lang, + description=description, + ) + node.add_file(files.HTMLZipFile(path=zip_path, language=lang)) + self._attach_local_thumbnail(node, node_files) + return node + + if kind == "video": + # Prefer high-res if present, else low-res, else any mp4/webm/m4v. + vrow = next((f for f in node_files if f["preset"] == "high_res_video"), None) + if vrow is None: + vrow = next((f for f in node_files if f["preset"] == "low_res_video"), None) + if vrow is None: + vrow = next( + (f for f in node_files if f["extension"] in ("mp4", "webm", "m4v")), + None, + ) + if vrow is None: + LOGGER.warning(f"video has no playable file: {title}") + return None + path = storage_path(vrow["local_file_id"], vrow["extension"]) + if not os.path.isfile(path): + LOGGER.warning(f"video file missing on disk: {path}") + return None + node = nodes.VideoNode( + source_id=source_id, + title=title, + license=license_obj, + language=lang, + description=description, + ) + node.add_file(files.VideoFile(path=path, language=lang)) + self._attach_local_thumbnail(node, node_files) + return node + + if kind == "document": + drow = next((f for f in node_files if f["preset"] == "document"), None) + if drow is None: + drow = next( + (f for f in node_files if f["extension"] in ("pdf", "epub")), None + ) + if drow is None: + LOGGER.warning(f"document has no file: {title}") + return None + path = storage_path(drow["local_file_id"], drow["extension"]) + if not os.path.isfile(path): + LOGGER.warning(f"document file missing on disk: {path}") + return None + cls = files.EPubFile if drow["extension"] == "epub" else files.DocumentFile + node = nodes.DocumentNode( + source_id=source_id, + title=title, + license=license_obj, + language=lang, + description=description, + ) + node.add_file(cls(path=path, language=lang)) + self._attach_local_thumbnail(node, node_files) + return node + + LOGGER.warning(f"unsupported kind {kind!r}, skipping: {title}") + return None + + def _attach_local_thumbnail(self, node, node_files): + """Upload thumbnail from Kolibri storage so nothing depends on remote lookups.""" + t = next( + (f for f in node_files if f["preset"] and "thumbnail" in f["preset"].lower()), + None, + ) + if not t: + return + path = storage_path(t["local_file_id"], t["extension"]) + if not os.path.isfile(path): + return + node.add_file(files.ThumbnailFile(path=path)) + + +if __name__ == "__main__": + PraDigiFreshChef().main() diff --git a/pradigi/kolibridb.py b/pradigi/kolibridb.py new file mode 100644 index 00000000..ffce01d2 --- /dev/null +++ b/pradigi/kolibridb.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +""" +Helpers for downloding Kolibri databases and printing topic trees: + + ./kolibridb.py 
--channel_id=95a52b386f2c485cb97dd60901674a98 + +or to get the same result as HTML (assuming you have `pandoc` installed): + + ./kolibridb.py --channel_id=95a52b386f2c485cb97dd60901674a98 --htmlexport + +""" +import argparse +from collections import defaultdict +from contextlib import redirect_stdout +from itertools import groupby +from operator import itemgetter +import os +import io +import json +import requests +import sqlite3 +import subprocess +import uuid + + +# DATABASE +################################################################################ + +DATABASES_DIR = "chefdata/databases" + +STUDIO_SERVER_LOOKUP = { + "production": "https://studio.learningequality.org", + "develop": "https://develop.studio.learningequality.org", + "local": "http://localhost:8080", +} + + +def download_db_file(channel_id, server="production", update=False): + """ + Download DB file for Kolibri channel `channel_id` from a Studio server. + """ + os.makedirs(DATABASES_DIR, exist_ok=True) + db_file_path = os.path.join(DATABASES_DIR, channel_id + ".sqlite3") + if os.path.exists(db_file_path) and not update: + return db_file_path + if server in STUDIO_SERVER_LOOKUP.keys(): + base_url = STUDIO_SERVER_LOOKUP[server] + elif "http" in server: + base_url = server.rstrip("/") + else: + raise ValueError("Unrecognized arg", server) + db_file_url = base_url + "/content/databases/" + channel_id + ".sqlite3" + response = requests.get(db_file_url) + if response.ok: + with open(db_file_path, "wb") as db_file: + for chunk in response: + db_file.write(chunk) + return db_file_path + else: + print(response.status_code, response.content) + raise ConnectionError("Failed to download DB file from", db_file_url) + + +def dbconnect(db_file_path): + conn = sqlite3.connect(db_file_path) + return conn + + +def dbex(conn, query): + """ + Execure a DB query and return results as a list of dicts. + """ + cursor = conn.cursor() + print("Running DB query", query) + cursor.execute(query) + results = [ + dict(zip([col[0] for col in cursor.description], row)) + for row in cursor.fetchall() + ] + return results + + +# BASIC ORM +################################################################################ + + +def dbfilter(rows, **kwargs): + """ + Return all the `rows` that match the `key=value` conditions, where keys are DB column + names and value is a row's value. + """ + selected = [] + for row in rows: + accept = True + for key, value in kwargs.items(): + if key not in row or row[key] != value: + accept = False + if accept: + selected.append(row) + return selected + + +def filter_key_in_values(rows, key, values): + """ + Return all the `rows` whose value for `key` is in the list `values`. + """ + if isinstance(values, str): + values = [values] + return list(filter(lambda r: r[key] in values, rows)) + + +def dbget(rows, **kwargs): + """ + Return all the `rows` that match the `key=value` conditions, where keys are DB column + names and value is a row's value. 
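+    Unlike dbfilter, this expects at most one match: it returns a single row
+    dict, or None when nothing matches, and asserts on multiple matches.
+
+    Example (the id value is illustrative):
+
+        channel_row = dbget(rows, id="e832106c639854e181616015a8b87910")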
+ """ + selected = dbfilter(rows, **kwargs) + assert len(selected) < 2, "mulitple results found" + if selected: + return selected[0] + else: + return None + + +def dbvalues_list(rows, *args, flat=False): + results = [] + for row in rows: + result = [] + for arg in args: + result.append(row[arg]) + results.append(result) + if flat: + return [result[0] for result in results] + else: + return results + + +# UTILS +################################################################################ + + +def sane_group_by(items, key): + """ + Wrapper for itertools.groupby to make it easier to use. + Returns a dict with keys = possible values of key in items + and corresponding values being lists of items that have that key. + """ + sorted_items = sorted(items, key=itemgetter(key)) + return dict((k, list(g)) for k, g in groupby(sorted_items, key=itemgetter(key))) + + +def count_values_for_attr(rows, *attrs): + counts = {} + for attr in attrs: + counts[attr] = defaultdict(int) + for row in rows: + val = row[attr] + counts[attr][val] += 1 + return counts + + +# KOLIBRI CHANNEL +################################################################################ + + +def get_channel(channel_id): + db_file_path = download_db_file(channel_id) + conn = sqlite3.connect(db_file_path) + return dbex(conn, "SELECT * FROM content_channelmetadata;")[0] + + +def get_nodes_by_id(conn, attach_files=True, attach_assessments=True): + nodes = dbex(conn, "SELECT * FROM content_contentnode;") + # TODO: load tags from content_contentnode_tags and content_contenttag + # TODO: load content_contentnode_has_prerequisite, content_contentnode_related + nodes_by_id = {} + for node in nodes: + nodes_by_id[node["id"]] = node + if attach_files: + # attach all the files associated with each node under the key "files" + files = get_files(conn) + local_files = get_local_files(conn) + local_file_lookup = {} + for local_file in local_files: + local_file_lookup[local_file["id"]] = local_file + for file in files: + node_id = file["contentnode_id"] + node = nodes_by_id[node_id] + local_file = local_file_lookup[file["local_file_id"]] + file["extension"] = local_file["extension"] + file["checksum"] = local_file["id"] + if "files" in node: + node["files"].append(file) + else: + node["files"] = [file] + if attach_assessments: + assessmentmetadata = get_assessmentmetadata(conn) + for aim in assessmentmetadata: + node = nodes_by_id[aim["contentnode_id"]] + # attach assesment_ids direclty to node to imitate ricecooker/studio + node["assessment_item_ids"] = json.loads(aim["assessment_item_ids"]) + node["assessmentmetadata"] = { + "number_of_assessments": aim["number_of_assessments"], + "mastery_model": aim["mastery_model"], + "randomize": aim["randomize"], + "is_manipulable": aim["is_manipulable"], + } + return nodes_by_id + + +def get_nodes_for_remote_files(channel_id): + try: + db_file_path = download_db_file(channel_id) + conn = sqlite3.connect(db_file_path) + return get_nodes_by_id(conn, attach_files=True, attach_assessments=False) + except Exception: + return {} + +def get_other_files(conn): + files = dbex(conn, "SELECT content_file.local_file_id, content_file.extension, content_file.preset, content_file.contentnode_id, content_contentnode.content_id FROM content_file INNER JOIN content_contentnode ON content_file.contentnode_id=content_contentnode.id;") + return files + + +def get_files(conn): + files = dbex(conn, "SELECT * FROM content_file;") + return files + + +def get_local_files(conn): + localfiles = dbex(conn, "SELECT * FROM 
content_localfile;")
+    return localfiles
+
+
+def get_assessmentmetadata(conn):
+    assessmentmetadata = dbex(conn, "SELECT * FROM content_assessmentmetadata;")
+    return assessmentmetadata
+
+
+def get_tree(conn):
+    """
+    Return a complete JSON tree of the entire channel.
+    """
+    nodes_by_id = get_nodes_by_id(conn)
+    nodes = nodes_by_id.values()
+    sorted_nodes = sorted(
+        nodes, key=lambda n: (n["parent_id"] or "0" * 32, n["sort_order"])
+    )
+    root = sorted_nodes[0]
+    for node in sorted_nodes[1:]:
+        parent = nodes_by_id[node["parent_id"]]
+        if "children" in parent:
+            parent["children"].append(node)
+        else:
+            parent["children"] = [node]
+    return root
+
+
+# NODE_ID UTILS
+################################################################################
+
+
+def node_id_from_source_ids(source_domain, channel_source_id, source_ids):
+    """
+    Compute the node_id (str) for the node whose path is `source_ids` (list)
+    in a channel identified by `source_domain` and `channel_source_id`.
+    """
+    domain_namespace = uuid.uuid5(uuid.NAMESPACE_DNS, source_domain)
+    content_ids = [
+        uuid.uuid5(domain_namespace, source_id).hex for source_id in source_ids
+    ]
+    print("computed content_ids =", content_ids)
+    channel_id = uuid.uuid5(domain_namespace, channel_source_id)
+    print("Computed channel_id =", channel_id.hex)
+    node_id = channel_id
+    for content_id in content_ids:
+        node_id = uuid.uuid5(node_id, content_id)
+    return node_id.hex
+
+
+# TREE PRINTING
+################################################################################
+
+CONTENT_KINDS = [
+    "topic",
+    "video",
+    "audio",
+    "exercise",
+    "document",
+    "slideshow",
+    "h5p",
+    "html5",
+]
+
+
+def get_stats(subtree):
+    """
+    Recursively compute kind-counts and total file_size (non-deduplicated).
+    """
+    if "children" in subtree and subtree["children"]:
+        stats = dict((kind, 0) for kind in CONTENT_KINDS)
+        stats["topic"] = 1  # count self
+        stats["size"] = 0
+        for child in subtree["children"]:
+            child_stats = get_stats(child)
+            for k, v in child_stats.items():
+                stats[k] += v
+        return stats
+    else:
+        size = sum([f["file_size"] for f in subtree["files"]])
+        return {subtree["kind"]: 1, "size": size}
+
+
+def stats_to_str(stats):
+    stats_str = " "
+    for key in CONTENT_KINDS:
+        if key in stats and stats[key]:
+            if stats[key] > 1:
+                stats_str += str(stats[key]) + " " + key + "s, "
+            else:
+                stats_str += str(stats[key]) + " " + key + ", "
+    size_mb_str = "%.2f" % (float(stats["size"]) / 1024 / 1024) + "MB"
+    stats_str += size_mb_str
+    return stats_str
+
+
+def print_subtree(subtree, level=0, extrakeys=None, maxlevel=2, printstats=True):
+    extra = ""
+    if level > maxlevel:
+        return
+    if extrakeys:
+        for key in extrakeys:
+            extra = extra + " " + key + "=" + subtree[key]
+    if printstats:
+        stats = get_stats(subtree)
+        extra += stats_to_str(stats)
+    title = subtree["title"].replace("\n", " ")
+    print(" " * 2 * level + " -", title + " (" + subtree["id"] + ")", extra)
+    if "children" in subtree:
+        for child in subtree["children"]:
+            print_subtree(
+                child,
+                level=level + 1,
+                extrakeys=extrakeys,
+                maxlevel=maxlevel,
+                printstats=printstats,
+            )
+
+
+# TREE EXPORT
+################################################################################
+
+
+def export_kolibri_json_tree(
+    channel_id=None, db_file_path=None, suffix="", server="production", update=False
+):
+    """
+    Convert a channel from Kolibri database file to a JSON tree.
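+
+    Example (server defaults to production Studio):
+
+        export_kolibri_json_tree(channel_id="e832106c639854e181616015a8b87910")
+        # downloads the DB into chefdata/databases/ if needed, then dumps the
+        # full topic tree as <db name>.json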
+ """ + if channel_id is None and db_file_path is None: + raise ValueError("Need to specify either channel_id or db_file_path") + + if db_file_path: + conn = dbconnect(db_file_path) + else: + db_file_path = download_db_file(channel_id, server=server, update=update) + conn = dbconnect(db_file_path) + + kolibri_tree = get_tree(conn) + conn.close() + + if db_file_path: + pre_filename = db_file_path.split(os.pathsep)[-1].replace(".sqlite3", "") + json_filename = pre_filename + suffix + ".json" + else: + json_filename = channel_id + suffix + ".json" + + with open(json_filename, "w") as jsonf: + json.dump(kolibri_tree, jsonf, indent=2, ensure_ascii=False, sort_keys=True) + print("Channel exported as Kolibri JSON Tree in " + json_filename) + + +# HTML EXPORTS +################################################################################ + +KOLIBRI_TREE_HTMLEXPORT_DIR = "reports/kolibrihtmltrees" + + +def export_kolibritree_as_html(kolibritree, maxlevel=7): + """ + Export `kolibritree` as HTML for inspection of contents. + """ + basedir = KOLIBRI_TREE_HTMLEXPORT_DIR + if not os.path.exists(basedir): + os.makedirs(basedir, exist_ok=True) + channel_id = kolibritree["id"] + path_md = os.path.join(basedir, "channel_{}_tree.md".format(channel_id)) + path_html = os.path.join(basedir, "channel_{}_tree.html".format(channel_id)) + + with io.StringIO() as buf, redirect_stdout(buf): + print("# Kolibri Topic Tree for channel", channel_id) + print("") + print_subtree(kolibritree, maxlevel=maxlevel) + output_md = buf.getvalue() + with open(path_md, "w") as mdfile: + mdfile.write(output_md) + + subprocess.call(["pandoc", "--from", "gfm", path_md, "-o", path_html]) + print("Saved", path_html) + os.remove(path_md) + + +# CLI +################################################################################ + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Kolibri channel topic tree viewer") + parser.add_argument("--channel_id", required=True, help="Channel ID") + parser.add_argument("--printmaxlevel", type=int, default=2, help="print tree depth") + parser.add_argument( + "--htmlexport", action="store_true", help="save topic tree as html" + ) + parser.add_argument("--htmlmaxlevel", type=int, default=7, help="html tree depth") + parser.add_argument( + "--update", action="store_true", help="Force re-download of DB file" + ) + + args = parser.parse_args() + + db_file_path = download_db_file(args.channel_id, update=args.update) + conn = dbconnect(db_file_path) + kolibritree = get_tree(conn) + + # PRINT IN TERMINAL + print_subtree(kolibritree, maxlevel=args.printmaxlevel) + + # HTML TREE EXPORT + if args.htmlexport: + export_kolibritree_as_html(kolibritree, maxlevel=args.htmlmaxlevel) diff --git a/pradigi/scripts/html5_test_server.py b/pradigi/scripts/html5_test_server.py new file mode 100644 index 00000000..7850dd84 --- /dev/null +++ b/pradigi/scripts/html5_test_server.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +""" +Tiny local server for hands-on testing of PraDigi HTML5 apps on a real +Android device. Lists every html5 node from the Kolibri-imported DB, +serves the (Android-fixed) zip contents for each, and prints the LAN URL +to open on your tablet. + + ./scripts/html5_test_server.py # port 8080 + ./scripts/html5_test_server.py --port 9000 + +Open the printed URL on the tablet; tap an app title to launch it. 
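+
+If the tablet can't reach the desktop's LAN IP (guest wifi, client isolation),
+adb port-forwarding is an alternative (assumes adb and USB debugging):
+
+    adb reverse tcp:8080 tcp:8080   # then open http://localhost:8080/ on the device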
+""" +import argparse +import http.server +import os +import socket +import socketserver +import sqlite3 +import tempfile +import threading +import urllib.parse +import zipfile + +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from fresh_chef import KOLIBRI_DB, storage_path, prepare_html5_zip + + +EXTRACT_ROOT = tempfile.mkdtemp(prefix="pradigi-html5-test-") + + +def lan_ip(): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(("8.8.8.8", 80)) + return s.getsockname()[0] + finally: + s.close() + + +def list_html5_apps(): + """Return [(node_id, title, zip_checksum, language)] sorted by title.""" + conn = sqlite3.connect(KOLIBRI_DB) + rows = conn.execute( + """ + SELECT n.id, n.title, f.local_file_id, n.lang_id + FROM content_contentnode n + JOIN content_file f ON f.contentnode_id=n.id + WHERE n.kind='html5' AND f.extension='zip' AND n.available=1 + GROUP BY n.id + ORDER BY n.title + """ + ).fetchall() + return rows + + +def ensure_extracted(checksum): + """Extract the fixed zip to EXTRACT_ROOT/{checksum}/ on first request.""" + dest = os.path.join(EXTRACT_ROOT, checksum) + if os.path.isdir(dest): + return dest + src = prepare_html5_zip(checksum) + with zipfile.ZipFile(src) as zf: + zf.extractall(dest) + return dest + + +class Handler(http.server.SimpleHTTPRequestHandler): + apps = [] + + def do_GET(self): + path = urllib.parse.unquote(self.path) + if path == "/" or path == "/index": + self.send_listing() + return + if path.startswith("/app/"): + parts = path.lstrip("/").split("/", 2) + if len(parts) < 2: + self.send_error(404) + return + checksum = parts[1] + subpath = parts[2] if len(parts) > 2 else "index.html" + try: + root = ensure_extracted(checksum) + except Exception as e: + self.send_error(500, f"extract failed: {e}") + return + file_path = os.path.join(root, subpath) + if not os.path.isfile(file_path): + self.send_error(404, f"not found: {subpath}") + return + # serve from the extracted directory with subpath resolution + self.path = "/" + subpath + self.directory = root + # Cheap single-file serve since SimpleHTTPRequestHandler uses self.directory + return http.server.SimpleHTTPRequestHandler.do_GET(self) + self.send_error(404) + + def translate_path(self, path): + # SimpleHTTPRequestHandler uses cwd; redirect to extracted app root + if hasattr(self, "directory") and self.directory: + return os.path.join(self.directory, path.lstrip("/")) + return super().translate_path(path) + + def send_listing(self): + body = [ + "PraDigi HTML5 apps", + "", + "", + f"

PraDigi apps ({len(self.apps)})

", + "", + "
    ", + ] + for node_id, title, checksum, lang in self.apps: + safe_title = ( + title.replace("&", "&").replace("<", "<").replace(">", ">") + ) + body.append( + f'
  • {safe_title} ' + f'[{lang or "?"}]
  • ' + ) + body.append("
") + body.append( + "" + ) + html = "".join(body).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(html))) + self.end_headers() + self.wfile.write(html) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--port", type=int, default=8080) + args = ap.parse_args() + + Handler.apps = list_html5_apps() + ip = lan_ip() + print(f"Serving {len(Handler.apps)} apps on:") + print(f" http://{ip}:{args.port}/ ← open this on your Android tablet") + print(f" http://localhost:{args.port}/ ← or here from desktop") + print(f"extracting to {EXTRACT_ROOT}") + + # Allow LAN access + class ReusableServer(socketserver.ThreadingTCPServer): + allow_reuse_address = True + + with ReusableServer(("0.0.0.0", args.port), Handler) as httpd: + httpd.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/pradigi/scripts/scan_all_zips.py b/pradigi/scripts/scan_all_zips.py new file mode 100644 index 00000000..00550f3d --- /dev/null +++ b/pradigi/scripts/scan_all_zips.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +""" +Pre-flight scan of every chefdata/zipfiles/{hash}/webroot.zip. + +Reports anything that would fail ricecooker's HTML5 validation: + - missing webroot.zip entirely + - zip that isn't a valid zip file + - zip with no entries (empty EOCD) + - zip missing index.html + - index.html with a that has no text and no children + +Used before a smoke run to enumerate all problem zips at once, rather +than fixing one per crash. +""" +import os +import sys +import zipfile + +import html5lib + +ROOT = "chefdata/zipfiles" + + +def scan_zip(zp): + try: + with zipfile.ZipFile(zp) as zf: + names = zf.namelist() + if not names: + return "empty_zip" + if "index.html" not in names: + return "no_index_html" + try: + idx = zf.read("index.html") + except Exception as e: + return f"unreadable_index: {e}" + try: + dom = html5lib.parse(idx, namespaceHTMLElements=False) + except Exception as e: + return f"parse_error: {e}" + body = dom.find("body") + if body is None: + return "no_body" + body_children = [ + c for c in body.iter() + if isinstance(c.tag, str) and c.tag != "body" + ] + body_text = (body.text or "").strip() + if not body_text and not body_children: + return "empty_body" + except zipfile.BadZipFile as e: + return f"bad_zip: {e}" + except Exception as e: + return f"error: {e}" + return None + + +def main(): + hashes = sorted( + d for d in os.listdir(ROOT) + if os.path.isdir(os.path.join(ROOT, d)) and len(d) == 32 + ) + print(f"scanning {len(hashes)} hash dirs under {ROOT}") + problems = [] + for h in hashes: + zp = os.path.join(ROOT, h, "webroot.zip") + if not os.path.exists(zp): + problems.append((h, "missing_webroot")) + continue + reason = scan_zip(zp) + if reason: + problems.append((h, reason)) + print(f"\n{len(problems)} problem zips:") + for h, reason in problems: + print(f" {h} {reason}") + return 0 if not problems else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/pradigi/scripts/verify_zipfix.py b/pradigi/scripts/verify_zipfix.py new file mode 100644 index 00000000..eba1fa62 --- /dev/null +++ b/pradigi/scripts/verify_zipfix.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +""" +Spot-check that zipfix.py has been applied across chefdata/zipfiles/. 
+ +Samples a random subset of {hash}/webroot.zip files, extracts them in memory, +and counts: + - files containing `Utils.mobileDeviceFlag=true` (should be 0 — offending code) + - files containing `Utils.mobileDeviceFlag=false` (the fixed marker) + - files starting with `b'` which indicates the old buggy write corrupted them + +Throwaway script, not committed. Run from the worktree root. +""" +import os +import random +import sys +import zipfile +from collections import Counter + +ROOT = "chefdata/zipfiles" +SAMPLE_SIZE = int(os.environ.get("SAMPLE", 15)) +OFFENDING = b"Utils.mobileDeviceFlag=true" +FIXED = b"Utils.mobileDeviceFlag=false" + + +def scan_zip(zip_path): + """Return (offending_hits, fixed_hits, corrupted_files) across .js/.html inside zip.""" + off = fix = corrupt = 0 + try: + with zipfile.ZipFile(zip_path) as zf: + for info in zf.filelist: + if not info.filename.endswith((".js", ".html")): + continue + with zf.open(info) as f: + head = f.read(4) + rest = f.read() + content = head + rest + if head.startswith(b"b'"): + corrupt += 1 + if OFFENDING in content: + off += 1 + if FIXED in content: + fix += 1 + except zipfile.BadZipFile: + return None + return off, fix, corrupt + + +def main(): + all_hashes = [ + d for d in os.listdir(ROOT) + if os.path.isdir(os.path.join(ROOT, d)) and len(d) == 32 + ] + print(f"Found {len(all_hashes)} hash dirs under {ROOT}") + + sample = random.sample(all_hashes, min(SAMPLE_SIZE, len(all_hashes))) + totals = Counter() + missing_webroot = 0 + bad_zips = [] + + for h in sample: + zp = os.path.join(ROOT, h, "webroot.zip") + if not os.path.exists(zp): + missing_webroot += 1 + continue + result = scan_zip(zp) + if result is None: + bad_zips.append(zp) + continue + off, fix, corrupt = result + totals["offending"] += off + totals["fixed"] += fix + totals["corrupted"] += corrupt + flag = " <<<" if (off or corrupt) else "" + print(f" {h}: offending={off}, fixed={fix}, corrupt={corrupt}{flag}") + + print() + print(f"Sample size: {len(sample)}") + print(f"Missing webroot.zip: {missing_webroot}") + print(f"BadZipFile errors: {len(bad_zips)}") + print(f"Totals across sample: {dict(totals)}") + if totals["offending"] or totals["corrupted"]: + print("FAIL — zipfix output looks incomplete.") + sys.exit(1) + print("OK — no offending code and no bytestring corruption in sample.") + + +if __name__ == "__main__": + main() diff --git a/pradigi/sushichef.py b/pradigi/sushichef.py index bb9f0803..58b570b3 100755 --- a/pradigi/sushichef.py +++ b/pradigi/sushichef.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python +# CONSTANTS USED TO SELECT APPROPRIATE CLASS DURING DESERIALIZATION FROM JSON +################################################################################ """ The PraDigi chef uses a mix of content from three sources: @@ -23,13 +24,17 @@ import requests import shutil -from le_utils.constants import content_kinds, file_types, licenses +from le_utils.constants import content_kinds, file_types, licenses, format_presets from le_utils.constants.languages import getlang from ricecooker.chefs import JsonTreeChef from ricecooker.classes.licenses import get_license +from ricecooker.classes.files import RemoteFile from ricecooker.config import LOGGER from ricecooker.utils.caching import (FileCache, CacheControlAdapter) -from ricecooker.utils.jsontrees import write_tree_to_json_tree +from ricecooker.utils.jsontrees import write_tree_to_json_tree, add_files, add_questions, read_tree_from_json +from ricecooker.exceptions import raise_for_invalid_channel + +from kolibridb import 
get_nodes_for_remote_files, dbconnect, get_local_files, get_files from structure import GAMENAME_KEY, TAKE_FROM_KEY from structure import TEMPLATE_FOR_LANG @@ -40,13 +45,19 @@ from transform import get_zip_file from transform import get_phet_zip_file from corrections import should_skip_file +from ricecooker.classes import nodes + +from le_utils.constants import exercises +from le_utils.constants import roles + +FAIL_FILES = [] PRADIGI_DOMAIN = 'prathamopenschool.org' -PRADIGI_SOURCE_ID__VARIANT_PRATHAM = 'pradigi-videos-and-games' # Pratham internal -PRADIGI_SOURCE_ID__VARIANT_LE = 'pradigi-channel' # Studio PUBLIC channel +PRADIGI_SOURCE_ID__VARIANT_PRATHAM = 'pradigi-channel' +PRADIGI_SOURCE_ID__VARIANT_LE = 'pradigi-channel' FULL_DOMAIN_URL = 'https://www.' + PRADIGI_DOMAIN PRADIGI_LICENSE = get_license(licenses.CC_BY_NC_SA, copyright_holder='PraDigi').as_dict() PRADIGI_WEBSITE_LANGUAGES = ['hi', 'mr', 'en', 'gu', 'kn', 'bn', 'ur', 'or', 'pnb', 'ta', 'te', 'as'] @@ -59,10 +70,10 @@ # In debug mode, only one topic is downloaded. LOGGER.setLevel(logging.DEBUG) -DEBUG_MODE = True # source_urls in content desriptions +DEBUG_MODE = False # source_urls in content desriptions # WebCache logic (downloaded web resources cached for one day -- good for dev) -cache = FileCache('.webcache') +cache = FileCache('/home/jacob/LE/.pradigi-webcache') basic_adapter = CacheControlAdapter(cache=cache) develop_adapter = CacheControlAdapter(heuristic=OneDayCache(), cache=cache) session = requests.Session() @@ -70,6 +81,37 @@ session.mount('https://www.' + PRADIGI_DOMAIN, develop_adapter) +TOPIC_NODE = content_kinds.TOPIC +VIDEO_NODE = content_kinds.VIDEO +AUDIO_NODE = content_kinds.AUDIO +EXERCISE_NODE = content_kinds.EXERCISE +DOCUMENT_NODE = content_kinds.DOCUMENT +HTML5_NODE = content_kinds.HTML5 +SLIDESHOW_NODE = content_kinds.SLIDESHOW + + +# TODO(Ivan): add constants.file_types to le_utils and discuss with Jordan + +VIDEO_FILE = file_types.VIDEO +AUDIO_FILE = file_types.AUDIO +DOCUMENT_FILE = file_types.DOCUMENT +EPUB_FILE = file_types.EPUB +HTML5_FILE = file_types.HTML5 +THUMBNAIL_FILE = file_types.THUMBNAIL +SUBTITLES_FILE = file_types.SUBTITLES +SLIDESHOW_IMAGE_FILE = file_types.SLIDESHOW_IMAGE +REMOTE_FILE = "remote_file" + + +INPUT_QUESTION = exercises.INPUT_QUESTION +MULTIPLE_SELECTION = exercises.MULTIPLE_SELECTION +SINGLE_SELECTION = exercises.SINGLE_SELECTION +FREE_RESPONSE = exercises.FREE_RESPONSE +PERSEUS_QUESTION = exercises.PERSEUS_QUESTION + + +PRADIGI_CHANNEL_ID = "e832106c639854e181616015a8b87910" + # SOURCE WEBSITES ################################################################################ PRADIGI_LANG_URL_MAP = { @@ -400,7 +442,7 @@ def get_subtree_by_subject_en(lang, subject): if lang not in PRADIGI_LANG_URL_MAP: raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP') - wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang) + wrt_filename = 'chefdata/vader/trees/pradigi_{}_web_resource_tree.json'.format(lang) with open(wrt_filename) as jsonfile: web_resource_tree = json.load(jsonfile) subject_subtrees = web_resource_tree['children'] @@ -425,7 +467,7 @@ def get_subtree_by_source_id(lang, source_id): """ if lang not in PRADIGI_LANG_URL_MAP: raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP') - wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang) + wrt_filename = 'chefdata/vader/trees/pradigi_{}_web_resource_tree.json'.format(lang) with open(wrt_filename) as jsonfile: web_resource_tree = 
json.load(jsonfile) # setup recusive find function @@ -450,13 +492,15 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): """ kind = tree['kind'] if kind in ['topic_page', 'subtopic_page', 'lesson_page', 'fun_page', 'story_page', 'special_subtopic_page']: - thumbnail = tree['thumbnail_url'] if 'thumbnail_url' in tree else None + thumbnail = None + + topic_node = dict( kind=content_kinds.TOPIC, source_id=tree['source_id'], language=lang, title=tree['title'], # or could get from Strings based on subject_en... - description='source_id=' + tree['source_id'] if DEBUG_MODE else '', + description='', #source_id=' + tree['source_id'] if DEBUG_MODE else '', thumbnail=thumbnail, license=PRADIGI_LICENSE, children=[], @@ -485,7 +529,7 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): source_id=tree['source_id'], language=lang, title=tree['title'], - description=tree.get('description', ''), + description='', thumbnail=thumbnail, license=PRADIGI_LICENSE, files=[], @@ -506,22 +550,26 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): elif kind == 'PrathamZipResource': if should_skip_file(tree['url']): return None # Skip games marked with the `SKIP GAME` correction actions - thumbnail = tree['thumbnail_url'] if 'thumbnail_url' in tree else None + + thumbnail = None # tree['thumbnail_url'] if 'thumbnail_url' in tree else None + html5_node = dict( kind=content_kinds.HTML5, source_id=tree['source_id'], language=lang, title=tree['title'], - description=tree.get('description', ''), + description='', #tree.get('description', ''), thumbnail=thumbnail, license=PRADIGI_LICENSE, files=[], ) + if 'phet.zip' in tree['url']: zip_tmp_path = get_phet_zip_file(tree['url'], tree['main_file']) else: zip_tmp_path = get_zip_file(tree['url'], tree['main_file']) if zip_tmp_path is None: + FAIL_FILES.append(tree['url']) raise ValueError('Could not get zip file from %s' % tree['url']) html5zip_file = dict( file_type=file_types.HTML5, @@ -538,7 +586,7 @@ def wrt_to_ricecooker_tree(tree, lang, filter_fn=lambda node: True): source_id=tree['source_id'], language=lang, title=tree['title'], - description=tree.get('description', ''), + description='', #tree.get('description', ''), thumbnail=thumbnail, license=PRADIGI_LICENSE, files=[], @@ -586,7 +634,7 @@ def find_games_for_lang(name, lang, take_from=None): language_en = PRADIGI_STRINGS[lang]['language_en'] # ??? 
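+    # ^ language_en looks unused in this function body; kept verbatim from the
+    #   legacy chef, which this patch retains for review continuity only.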
# load website game web resource data - WEBSITE_GAMES_OUTPUT = 'chefdata/trees/website_games_all_langs.json' + WEBSITE_GAMES_OUTPUT = 'chefdata/vader/trees/website_games_all_langs.json' website_data = json.load(open(WEBSITE_GAMES_OUTPUT, 'r')) if lang in website_data: website_data_lang = website_data[lang] @@ -611,7 +659,6 @@ def find_games_for_lang(name, lang, take_from=None): if len(games) == 0: pass - # print('game', name, 'not found for lang', lang) return games @@ -629,9 +676,9 @@ def website_game_webresouce_to_ricecooker_node(lang, web_resource): source_id=web_resource['source_id'], language=lang, title=web_resource['title'], - description='source_url=' + web_resource['url'] if DEBUG_MODE else '', + description='', license=PRADIGI_LICENSE, - thumbnail=web_resource.get('thumbnail_url'), + thumbnail=None,# web_resource.get('thumbnail_url'), files=[], ) zip_tmp_path = get_zip_file(web_resource['url'], web_resource['main_file']) @@ -642,6 +689,7 @@ def website_game_webresouce_to_ricecooker_node(lang, web_resource): language=lang, ) game_node['files'].append(zip_file) + game_node['md5hash'] = zip_tmp_path.split('/')[-1] LOGGER.debug('Created HTML5AppNode for game ' + web_resource['title']) return game_node else: @@ -692,7 +740,7 @@ def extract_website_games_from_tree(lang): if lang not in PRADIGI_LANG_URL_MAP: raise ValueError('Language `lang` must be in PRADIGI_LANG_URL_MAP') # READ IN - wrt_filename = 'chefdata/trees/pradigi_{}_web_resource_tree.json'.format(lang) + wrt_filename = 'chefdata/vader/trees/pradigi_{}_web_resource_tree.json'.format(lang) with open(wrt_filename) as jsonfile: web_resource_tree = json.load(jsonfile) # PROCESS @@ -713,7 +761,6 @@ def recursive_extract_website_games(subtree): child_url = child_url.replace('https://www.prathamopenschool.org/CourseContent/Games/', '') child_url = child_url.replace('http://www.prathamopenschool.org/CourseContent/Games/', '') child['title_en'] = child_url.replace('.zip', '') - print('EXTRACTED game name', child['title_en'], 'form url', child['url']) website_games.append(child) else: # leave other games where they are @@ -729,10 +776,400 @@ def recursive_extract_website_games(subtree): recursive_extract_website_games(web_resource_tree) return website_games +# use local file id to get storage file +def zip_storage_path(filename): + return f"storage/{filename[0]}/{filename[1]}/{filename}.zip" + +_SQLITE_PATH = f"{PRADIGI_CHANNEL_ID}.sqlite3" +if os.path.exists(_SQLITE_PATH): + conn = dbconnect(_SQLITE_PATH) + FILES = get_files(conn) + REMOTE_NODES = get_nodes_for_remote_files(PRADIGI_CHANNEL_ID) +else: + LOGGER.warning( + "Channel DB %s not found; RemoteNode thumbnails will not be populated." 
+ % _SQLITE_PATH + ) + conn = None + FILES = [] + REMOTE_NODES = {} +LOCAL_FILES = dict() + +def _set_thumbnail_file(self, preset): + content_node_id = self.get_node_id().hex + remote_node_files = REMOTE_NODES.get(content_node_id, {}).get('files', []) + for file in remote_node_files: + if file['preset'] == preset: + remote_file = RemoteFile( + file["checksum"], + file["extension"], + file["preset"], + is_primary=False + ) + self.add_file(remote_file) + self.thumbnail = remote_file + + + + +nodes.TopicNode._set_thumbnail_file = _set_thumbnail_file +nodes.HTML5AppNode._set_thumbnail_file = _set_thumbnail_file +nodes.StudioContentNode._set_thumbnail_file = _set_thumbnail_file + + +# Shim ricecooker >=0.8 HTML5 validation: upstream assumes body.text is always +# a string, but BeautifulSoup returns None when has only child elements +# (no direct text node). Replace with a None-safe version and log offending +# paths so Section 3 can investigate. +from ricecooker.utils.pipeline.convert import HTML5ConversionHandler +from ricecooker.utils.pipeline.exceptions import InvalidFileException +import html5lib +from html5lib.html5parser import ParseError + + +def _validate_archive_safe(self, path: str): + with self.open_and_verify_archive(path) as zf: + index_html = self.read_file_from_archive(zf, "index.html") + try: + dom = html5lib.parse(index_html, namespaceHTMLElements=False) + body = dom.find("body") + if body is None: + raise InvalidFileException( + f"File {path} is not a valid HTML5 file, index.html is missing a body element." + ) + body_children = [ + c for c in body.iter() + if isinstance(c.tag, str) and c.tag != "body" + ] + body_text = (body.text or "").strip() + if not body_text and not body_children: + raise InvalidFileException( + f"File {path} is not a valid HTML5 file, index.html is empty." + ) + if body.text is None and not body_text: + LOGGER.info("html5-shim: body.text was None but had %d children in %s" % (len(body_children), path)) + except ParseError: + raise InvalidFileException( + f"File {path} is not a valid HTML5 file, index.html is not well-formed." + ) + + +HTML5ConversionHandler.validate_archive = _validate_archive_safe + +def build_tree_from_json(parent_node, sourcetree): + """ + Recusively parse nodes in the list `sourcetree` and add them as children + to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`. + """ + EXPECTED_NODE_TYPES = [ + TOPIC_NODE, + VIDEO_NODE, + AUDIO_NODE, + EXERCISE_NODE, + DOCUMENT_NODE, + HTML5_NODE, + SLIDESHOW_NODE, + ] + """ + workout the node ID from the source_id -- node_id is based on tree position & source_id + + Allow a JSON tree to accept a RemoteNode (aka StudioContentNode) + if kind == REMOTE_NODE: + child_node = nodes.RemoteNode( + source_id=source_node["source_id"], + ) + + child_node.get_node_id() # returns uid, call .hex? 
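+
+    Sketch of what the branches below do instead (the StudioContentNode
+    signature matches its uses later in this function):
+
+        parent_node.add_child(child_node)        # placement determines node_id
+        node_id = child_node.get_node_id().hex   # .hex is the str form used below
+        removed = parent_node.children.pop()     # swap out the local copy
+        parent_node.add_child(nodes.StudioContentNode(
+            PRADIGI_CHANNEL_ID,
+            source_node_id=node_id,
+            source_content_id=removed.get_content_id().hex,
+            title=removed.title,
+        ))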
+ """ + print("building tree from json") + + for source_node in sourcetree: + kind = source_node["kind"] + if kind not in EXPECTED_NODE_TYPES: + LOGGER.critical("Unexpected node kind found: " + kind) + raise NotImplementedError("Unexpected node kind found in json data.") + + if kind == TOPIC_NODE: + child_node = nodes.TopicNode( + source_id=source_node.get("source_id", None), + title=source_node["title"], + description=source_node.get("description"), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + # no role for topics (computed dynaically from descendants) + language=source_node.get("language"), + #thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), + ) + parent_node.add_child(child_node) + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == child_node.get_node_id().hex: + for j in range(len(parent_node.children[i].files)): + if parent_node.children[i].files[j].preset == format_presets.TOPIC_THUMBNAIL: + removed = parent_node.children[i].files.pop(j) + logging.info("Removed thumbnail file from {}".format(parent_node.children[i].title)) + parent_node.children[i]._set_thumbnail_file(format_presets.TOPIC_THUMBNAIL) + source_tree_children = source_node.get("children", []) + build_tree_from_json(child_node, source_tree_children) + + elif kind == VIDEO_NODE: + child_node = nodes.VideoNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), + ) + + add_files(child_node, source_node.get("files") or []) + parent_node.add_child(child_node) + node_id = child_node.get_node_id().hex + + # But, remove the child if it's the same ID so we can replace it w/ the remote node + removed = None + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == node_id: + removed = parent_node.children.pop(i) + break + # use remote node if something was removed, if nothing was removed, we're done + if removed: + remote_node = nodes.StudioContentNode( + PRADIGI_CHANNEL_ID, + source_node_id=node_id, + source_content_id=removed.get_content_id().hex, + title=source_node["title"], + description='', + ) + remote_node.source_id = source_node["source_id"] + parent_node.add_child(remote_node) + #node_id = child_node.get_node_id().hex + + #add_files(child_node, source_node.get("files") or []) + #for i in range(len(parent_node.children)): + #if parent_node.children[i].get_node_id().hex == node_id: + #parent_node.children.pop(i) + #break + # figure out how to remove the child - parent_node.children.pop() + # hard code channel_id to make it work on my staging channel + #child_node = nodes.StudioContentNode(PRADIGI_CHANNEL_ID, source_node_id=node_id, title=source_node["title"]) + #import ipdb;ipdb.set_trace() + #child_node.source_id = source_node["source_id"] + #parent_node.add_child(child_node) + + elif kind == AUDIO_NODE: + child_node = nodes.AudioNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + 
license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), + ) + add_files(child_node, source_node.get("files") or []) + parent_node.add_child(child_node) + node_id = child_node.get_node_id().hex + + # But, remove the child if it's the same ID so we can replace it w/ the remote node + removed = None + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == node_id: + removed = parent_node.children.pop(i) + break + # use remote node if something was removed, if nothing was removed, we're done + if removed: + remote_node = nodes.StudioContentNode( + PRADIGI_CHANNEL_ID, + source_node_id=node_id, + source_content_id=removed.get_content_id().hex, + title=source_node["title"], + description='', + ) + remote_node.source_id = source_node["source_id"] + parent_node.add_child(remote_node) + + elif kind == EXERCISE_NODE: + child_node = nodes.ExerciseNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get( + "derive_thumbnail", False + ), # not supported yet + tags=source_node.get("tags"), + exercise_data=source_node.get("exercise_data"), + questions=[], + ) + add_questions(child_node, source_node.get("questions") or []) + parent_node.add_child(child_node) + node_id = child_node.get_node_id().hex + + # But, remove the child if it's the same ID so we can replace it w/ the remote node + removed = None + for i in range(len(parent_node.children)): + if parent_node.children[i].get_node_id().hex == node_id: + removed = parent_node.children.pop(i) + break + # use remote node if something was removed, if nothing was removed, we're done + if removed: + remote_node = nodes.StudioContentNode( + PRADIGI_CHANNEL_ID, + source_node_id=node_id, + source_content_id=removed.get_content_id().hex, + title=source_node["title"], + description='', + ) + remote_node.source_id = source_node["source_id"] + parent_node.add_child(remote_node) + + elif kind == DOCUMENT_NODE: + child_node = nodes.DocumentNode( + source_id=source_node["source_id"], + title=source_node["title"], + description='', + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + tags=source_node.get("tags"), + ) + # Things are just normal around here... 
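+        # (Same pattern as the other kinds: add the node so ricecooker
+        #  computes its tree-position node_id, then pop it and re-add a
+        #  StudioContentNode that points at the already-published copy,
+        #  so the document file itself is never re-uploaded.)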
+            add_files(child_node, source_node.get("files") or [])
+            parent_node.add_child(child_node)
+            node_id = child_node.get_node_id().hex
+
+            # Swap the freshly added child for a StudioContentNode reference.
+            removed = None
+            for i in range(len(parent_node.children)):
+                if parent_node.children[i].get_node_id().hex == node_id:
+                    removed = parent_node.children.pop(i)
+                    break
+            if removed:
+                remote_node = nodes.StudioContentNode(
+                    PRADIGI_CHANNEL_ID,
+                    source_node_id=node_id,
+                    source_content_id=removed.get_content_id().hex,
+                    title=source_node["title"],
+                    description='',
+                )
+                remote_node.source_id = source_node["source_id"]
+                parent_node.add_child(remote_node)
+
+        elif kind == HTML5_NODE:
+            child_node = nodes.HTML5AppNode(
+                source_id=source_node["source_id"],
+                title=source_node["title"],
+                description='',
+                license=get_license(**source_node["license"]),
+                author=source_node.get("author"),
+                aggregator=source_node.get("aggregator"),
+                provider=source_node.get("provider"),
+                role=source_node.get("role", roles.LEARNER),
+                language=source_node.get("language"),
+                # thumbnail=source_node.get("thumbnail"),
+                # derive_thumbnail=source_node.get("derive_thumbnail", False),
+                tags=source_node.get("tags"),
+            )
+
+            # Separate out the html5 zip so it is added last, after any
+            # thumbnails or other supplementary files.
+            zip_file = [z for z in source_node.get("files", []) if z["file_type"] == "html5"][0]
+            files_to_add = [f for f in source_node.get("files") if f["file_type"] != "html5"]
+            files_to_add.append(zip_file)
+            add_files(child_node, files_to_add)
+
+            parent_node.add_child(child_node)
+
+            # NOTE: html5 nodes are deliberately NOT swapped for remote
+            # references -- their zips must be re-uploaded with the Android
+            # mobileDeviceFlag fix applied.
+
+        elif kind == SLIDESHOW_NODE:
+            child_node = nodes.SlideshowNode(
+                source_id=source_node["source_id"],
+                title=source_node["title"],
+                description='',
+                license=get_license(**source_node["license"]),
+                author=source_node.get("author"),
+                aggregator=source_node.get("aggregator"),
+                provider=source_node.get("provider"),
+                role=source_node.get("role", roles.LEARNER),
+                language=source_node.get("language"),
+                thumbnail=source_node.get("thumbnail"),
+                derive_thumbnail=source_node.get("derive_thumbnail", False),
+                tags=source_node.get("tags"),
+            )
+            add_files(child_node, source_node.get("files") or [])
+            parent_node.add_child(child_node)
+            node_id = child_node.get_node_id().hex
+
+            # Swap the freshly added child for a StudioContentNode reference.
+            removed = None
+            for i in range(len(parent_node.children)):
+                if parent_node.children[i].get_node_id().hex == node_id:
+                    removed = parent_node.children.pop(i)
+                    break
+            if removed:
+                remote_node = nodes.StudioContentNode(
+                    PRADIGI_CHANNEL_ID,
+                    source_node_id=node_id,
+                    source_content_id=removed.get_content_id().hex,
+                    title=source_node["title"],
+                    description='',
+                )
+                remote_node.source_id = source_node["source_id"]
+                parent_node.add_child(remote_node)
+
+        # TODO: add support for H5P content kind
+        else:
+            LOGGER.critical("Encountered an unknown kind: " + str(source_node))
+            continue
+    return parent_node
@@ -743,7 +1180,7 @@ class PraDigiChef(JsonTreeChef):
     """
     SushiChef script for importing and merging the content from these sources:
       - Video, PDFs, and interactive demos from http://www.prathamopenschool.org/
-      - Games from http://www.prathamopenschool.org/
+      - Games from http://www.prathamopenschool.org/
     """
     RICECOOKER_JSON_TREE = 'pradigi_ricecooker_json_tree.json'
@@ -753,7 +1190,7 @@ def crawl(self, args, options):
         Crawl website and save web resource trees in chefdata/trees/.
         """
         from pradigi_crawlers import PraDigiCrawler
-
+
         # website
         for lang in PRADIGI_WEBSITE_LANGUAGES:
             website_crawler = PraDigiCrawler(lang=lang)
@@ -764,15 +1201,27 @@
         for lang in PRADIGI_WEBSITE_LANGUAGES:
             lang_games = extract_website_games_from_tree(lang)
             website_games[lang] = lang_games
-        WEBSITE_GAMES_OUTPUT = 'chefdata/trees/website_games_all_langs.json'
+        WEBSITE_GAMES_OUTPUT = 'chefdata/vader/trees/website_games_all_langs.json'
         # Save website games
         with open(WEBSITE_GAMES_OUTPUT, 'w') as json_file:
             json.dump(website_games, json_file, ensure_ascii=False, indent=2, sort_keys=True)
 
+    def construct_channel(self, **kwargs):
+        """
+        Build the channel tree by adding TopicNodes and ContentNode children.
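+
+        Previously published leaves come back from the vendored
+        build_tree_from_json as StudioContentNode references, so only the
+        HTML5 apps (which need the Android fix) are uploaded with files.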
+ """ + channel = self.get_channel(**kwargs) + json_tree_path = self.get_json_tree_path(**kwargs) + json_tree = read_tree_from_json(json_tree_path) + build_tree_from_json(channel, json_tree["children"]) + ## Create video node, get node ID, create RemoteNode in its place + ## vendor build_tree_from_json -- know vidoe & doc nodes need to be remove nodes + raise_for_invalid_channel(channel) + return channel def build_subtree_for_lang(self, lang): LOGGER.info('Building subtree for lang {}'.format(lang)) - + lang_subtree = copy.deepcopy(TEMPLATE_FOR_LANG) lang_obj = getlang(lang) language_en = PRADIGI_STRINGS[lang]['language_en'] @@ -874,9 +1323,10 @@ def build_subtree_for_lang(self, lang): # TODO: check for empty sub-folders too age_groups_subtree['children'] = nonempty_subject_subtrees - # Special handling for '3-6 years' children: - # Replace the contents of the first child (KhelBadi) with its children - # then append the any remaining nodes in this age group + # Flatten the '3-6 years' agre group to contain contents of KhelBadi + # if age_groups_subtree['title'] == '3-6 years' and len(age_groups_subtree['children']) == 1: + # khelbadi_subtree = age_groups_subtree['children'][0] + # age_groups_subtree['children'] = khelbadi_subtree['children'] if age_groups_subtree['title'] == '3-6 years' and age_groups_subtree['children']: new_children = [] khelbadi_subtree = age_groups_subtree['children'][0] @@ -884,6 +1334,11 @@ def build_subtree_for_lang(self, lang): other_subtrees = age_groups_subtree['children'][1:] new_children.extend(other_subtrees) age_groups_subtree['children'] = new_children + # flat_subfolders = [] + # for folder in age_groups_subtree['children']: + # for subfolder in folder['children']: + # flat_subfolders.append(subfolder) + # age_groups_subtree['children'] = flat_subfolders return lang_subtree @@ -895,7 +1350,7 @@ def pre_run(self, args, options): LOGGER.info('in pre_run...') # delete .zip files in temporary dir when running using update - if args['update']: + if False or args['update']: LOGGER.info('Deleting all zips in cache dir {}'.format(HTML5APP_ZIPS_LOCAL_DIR)) for rel_path in os.listdir(HTML5APP_ZIPS_LOCAL_DIR): abs_path = os.path.join(HTML5APP_ZIPS_LOCAL_DIR, rel_path) @@ -903,12 +1358,12 @@ def pre_run(self, args, options): shutil.rmtree(abs_path) # option to skip crawling stage - if 'nocrawl' not in options: - self.crawl(args, options) + #if 'nocrawl' not in options: + #self.crawl(args, options) # Conditionally determine `source_id` depending on variant specified if 'variant' in options and options['variant'].upper() == 'LE': - # Official PraDigi channel = + # Official PraDigi channel = channel_name = 'PraDigi' channel_source_id = PRADIGI_SOURCE_ID__VARIANT_LE DEBUG_MODE = False diff --git a/pradigi/transform.py b/pradigi/transform.py index e1fa47af..4e5b2aee 100644 --- a/pradigi/transform.py +++ b/pradigi/transform.py @@ -72,134 +72,48 @@ def get_zip_file(zip_file_url, main_file): THe `main_file` needs to be renamed to index.html to make it compatible with Kolibri. 
""" key = zip_file_url + main_file - destpath = make_temporary_dir_from_key(key) - - # Check for "REPLACE WITH:" correction rule for the current `zip_file_url` - replacement_url = should_replace_with(zip_file_url) - if replacement_url: - zip_file_url = replacement_url - - # return cached version if already there + key_bytes = key.encode('utf-8') + m = hashlib.md5() + m.update(key_bytes) + subdir = m.hexdigest() + destpath = os.path.join("chefdata/zipfiles/", subdir) final_webroot_path = os.path.join(destpath, 'webroot.zip') if os.path.exists(final_webroot_path): - return final_webroot_path + # Existence isn't enough: we've seen 22-byte empty zips in the cache + # that pass this check but fail later at ricecooker validation with a + # confusing "No required format preset" error. Verify the zip has at + # least one entry before returning its path. + try: + with zipfile.ZipFile(final_webroot_path) as zf: + names = zf.namelist() + if not names: + LOGGER.error( + "get_zip_file: %s exists but is an EMPTY zip (url=%s)" + % (final_webroot_path, zip_file_url) + ) + elif "index.html" not in names: + LOGGER.error( + "get_zip_file: %s has no index.html at root (url=%s). " + "top entries: %s" + % (final_webroot_path, zip_file_url, names[:3]) + ) + else: + return final_webroot_path + except zipfile.BadZipFile as e: + LOGGER.error( + "get_zip_file: %s exists but is NOT a valid zip: %s (url=%s)" + % (final_webroot_path, e, zip_file_url) + ) else: - LOGGER.error("Now we need local files so we can process them: %s" % final_webroot_path) - - try: - download_file(zip_file_url, destpath, request_fn=make_request) - - zip_filename = zip_file_url.split('/')[-1] # e.g. Mathematics.zip - zip_basename = zip_filename.rsplit('.', 1)[0] # e.g. Mathematics/ - - # July 31: handle ednge cases where zip filename doesn't match folder name inside it - awazchitras = ['Awazchitra_HI', 'Awazchitra_TL', 'Awazchitra_KN', - 'Awazchitra_BN', 'Awazchitra_OD', 'Awazchitra_PN', 'Awazchitra_TM'] - for awazchitra in awazchitras: - if awazchitra in zip_basename: - zip_basename = zip_basename.replace('Awazchitra', 'AwazChitra') - if '_KKS_Hi' in zip_basename: - zip_basename = zip_basename.replace('_KKS_Hi', '_KKS_HI') - - # Mar 2: more edge cases where zip filename doesn't match folder name inside it - if 'Memorygamekb' in zip_basename: - zip_basename = zip_basename.replace('Memorygamekb', 'MemoryGamekb') - if 'cityofstories' in zip_basename: - zip_basename = zip_basename.replace('cityofstories', 'CityOfStories') - - # Jun 12: fix more edge cases where .zip filename doesn't match dir name - if '_KKS_Gj' in zip_basename: - zip_basename = zip_basename.replace('_KKS_Gj', '_KKS_GJ') - if 'ShabdKhel' in zip_basename: - zip_basename = zip_basename.replace('ShabdKhel', 'Shabdkhel') - - zip_folder = os.path.join(destpath, zip_basename) # e.g. destpath/Mathematics/ - main_file = main_file.split('/')[-1] # e.g. activity_name.html or index.html - - if 'KhelbadiKahaniyan_MR' in zip_basename: - # Inconsistency --- `main_file` contains dir name, and not index.html - main_file = 'index.html' - - # Jul 8th: handle weird case-insensitive webserver main_file - if main_file == 'mainexpand.html': - main_file = 'mainExpand.html' # <-- this is the actual filename in the zip - - # Zip files from Pratham website have the web content inside subfolder - # of the same as the zip filename. We need to recreate these zip files - # to make sure the index.html is in the root of the zip. 
-        local_zip_file = os.path.join(destpath, zip_filename)
-        with zipfile.ZipFile(local_zip_file) as zf:
-            # If main_file is in the root (like zips from the game repository)
-            # then we need to extract the zip contents to subfolder zip_basename/
-            for zfileinfo in zf.filelist:
-                if zfileinfo.filename == main_file:
-                    destpath = os.path.join(destpath, zip_basename)
-            # Extract zip so main file will be in destpath/zip_basename/index.html
-            zf.extractall(destpath)
-
-        # In some cases, the files are under the www directory,
-        # let's move them up one level.
-        www_dir = os.path.join(zip_folder, 'www')
-        if os.path.isdir(www_dir):
-            files = os.listdir(www_dir)
-            for f in files:
-                shutil.move(os.path.join(www_dir, f), zip_folder)
-
-        # Rename `main_file` to index.html
-        src = os.path.join(zip_folder, main_file)
-        dest = os.path.join(zip_folder, 'index.html')
-        os.rename(src, dest)
-
-        # Logic to add margin-top:44px; for games that match Corrections tab
-        add_margin_top = False
-        for row in PRADIGI_CORRECTIONS_LIST:
-            if row[CORRECTIONS_ACTION_KEY] == ADD_MARGIN_TOP_ACTION:
-                pat = row[CORRECTIONS_SOURCE_URL_PAT_KEY]
-                m = pat.match(zip_file_url)
-                if m:
-                    add_margin_top = True
-        if add_margin_top:
-            if zip_file_url.endswith('CourseContent/Games/Mathematics.zip'):
-                LOGGER.info("adding body.margin-top:44px; to ALL .html files in: %s" % zip_file_url)
-                for root, dirs, files in os.walk(zip_folder):
-                    for file in files:
-                        if file.endswith(".html"):
-                            add_body_margin_top(root, file)
-            else:
-                LOGGER.info("adding body.margin-top:44px; to index.html in: %s" % zip_file_url)
-                add_body_margin_top(zip_folder, 'index.html')
-
-        # Replace occurences of `main_file` with index.html to avoid broken links
-        for root, dirs, files in os.walk(zip_folder):
-            for file in files:
-                if file.endswith(".html") or file.endswith(".js"):
-                    file_path = os.path.join(root, file)
-                    # use bytes to avoid Unicode errors "invalid start/continuation byte"
-                    bytes_in = open(file_path, 'rb').read()
-                    bytes_out = bytes_in.replace(main_file.encode('utf-8'), b'index.html')
-                    open(file_path, 'wb').write(bytes_out)
-
-        for root, dirs, files in os.walk(zip_folder):
-            for file in files:
-                if file.endswith(".js"):
-                    LOGGER.info("Fixing Android bug in JS file: %s" % file)
-                    with open(file, 'w') as f:
-                        content = f.read()
-                        content = content.replace(
-                            'Utils.mobileDeviceFlag=true',
-                            'Utils.mobileDeviceFlag=false'
-                        )
-                        f.write(content)
-                        f.close()
-        # create the zip file and copy it to 
-        tmp_predictable_zip_path = create_predictable_zip(zip_folder)
-        shutil.copyfile(tmp_predictable_zip_path, final_webroot_path)
-        return final_webroot_path
-
-    except Exception as e:
-        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
-                     (zip_file_url, main_file, destpath, e))
-        return None
+        LOGGER.error("get_zip_file: no webroot.zip at %s (url=%s, main_file=%s)"
+                     % (final_webroot_path, zip_file_url, main_file))
+        if os.path.isdir(destpath):
+            for root, dirs, files in os.walk(destpath):
+                if files:
+                    LOGGER.error(" dir contents: %s" % files)
+        else:
+            LOGGER.error(" hash dir does not exist at all")
+    return None
 
 
 PHET_INDEX_HTML_TEMPLATE = """
diff --git a/pradigi/zipfix.py b/pradigi/zipfix.py
new file mode 100644
index 00000000..e00cef81
--- /dev/null
+++ b/pradigi/zipfix.py
@@ -0,0 +1,113 @@
+import os
+import shutil
+import tempfile
+import zipfile
+import ntpath
+import logging
+from ricecooker.utils.zip import create_predictable_zip
+
+logging.basicConfig(level=logging.INFO, filename="zipfix.log")
+
+LIKELY_OFFENDING = "Utils.mobileDeviceFlag"
+OFFENDING_CODE = "Utils.mobileDeviceFlag=true"
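+# The offending assignment appears verbatim in Pratham's bundled app JS, so an
+# exact-string replace is sufficient; no parsing or regex needed.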
"Utils.mobileDeviceFlag=true" +FIXED_CODE = "Utils.mobileDeviceFlag=false" + +def apply_fix_better(filepath, dryrun=False): + """ + Traverse files and replace offending code with fixed code, handling Unicode issues. + + Args: + filepath (str): Path to the directory or file. + dryrun (bool): If True, do not modify files. + """ + for root, dirs, files in os.walk(filepath): + for file in files: + if file.endswith((".js", ".html")): + file_path = os.path.join(root, file) + content = None + + # Attempt to read the file with utf-8 encoding + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Handle UnicodeDecodeError by trying an alternate encoding + except UnicodeDecodeError: + logging.warning("UTF-8 decoding failed for %s. Trying unicode_escape.", file_path) + try: + with open(file_path, "r", encoding="unicode_escape") as f: + content = f.read() + except UnicodeDecodeError: + logging.error("Cannot decode %s with either UTF-8 or unicode_escape. Skipping.", file_path) + continue + + # If content was successfully read, perform the fix + if content: + #logging.info("First chars of {}: {}".format(file_path, content[0:100])) + + # Fix bytestring corruption: files that start with b' and end with ' + if content.startswith("b'") and len(content) > 2: + logging.info("Fixing bytestring corruption in %s", file_path) + # Strip b' prefix + content = content[2:] + # Strip trailing ' if present + if content.endswith("'"): + content = content[:-1] + # Decode escape sequences like \r\n, \t, etc. + try: + content = content.encode('utf-8').decode('unicode_escape') + except UnicodeDecodeError: + logging.error("Failed to decode escape sequences in %s", file_path) + + if FIXED_CODE in content: + logging.info("Previously fixed: %s", file_path) + + if OFFENDING_CODE in content: + logging.info("Fixing mobile device flag in %s", file_path) + content = content.replace(OFFENDING_CODE, FIXED_CODE) + + if not dryrun: + with open(file_path, "w", encoding="utf-8") as f: + logging.info("Writing back... {}".format(file_path)) + f.write(content) + # No need for manual recursion - os.walk already handles subdirectories + +def fix_zips(path, dryrun=False): + """ + Process a zip file, extract, fix offending code, and repackage. + + Args: + path (str): Path to the zip file. + dryrun (bool): If True, do not modify files. + """ + zip_filename = ntpath.basename(path) + extract_path = tempfile.mkdtemp() + + + try: + # Extract zip file contents + with zipfile.ZipFile(path) as zf: + zf.extractall(extract_path) + logging.info("Extracted %s to %s", path, extract_path) + + # Apply fixes to the extracted files + apply_fix_better(extract_path, dryrun) + + except zipfile.BadZipFile: + logging.error("%s is not a valid zip file. Skipping.", zip_filename) + + finally: + # Ensure cleanup of the temporary directory + tmp_predictable_zip_path = create_predictable_zip(extract_path) + shutil.copyfile(tmp_predictable_zip_path, path) + shutil.rmtree(extract_path) + os.remove(tmp_predictable_zip_path) + logging.info("Cleaned up temporary directory %s", extract_path) + +# Main loop to process all zip files in the target directory +for root, dirs, files in os.walk('./chefdata/zipfiles/'): + for name in files: + if name.endswith('.zip'): + path = os.path.join(root, name) + logging.info("Processing zip file: %s", path) + fix_zips(path, dryrun=False) # Set dryrun=True for testin