|
| 1 | +"""Bulk ingestion helpers for a real-time fleet feed. |
| 2 | +
|
| 3 | +A city feed posts, on every tick, one (vehicleId, position, time) observation |
| 4 | +per vehicle as a GeoJSON Point feature or a GeoParquet row. Each observation is |
| 5 | +appended as one instant to that vehicle's moving feature, extending its |
| 6 | +`tgeompoint` trajectory in `temporal_geometries`. The geometry and temporal work |
| 7 | +run inside MobilityDB (ST_MakePoint / ST_GeomFromWKB, tgeompoint, appendInstant). |
| 8 | +""" |
| 9 | +import gzip |
| 10 | +import io |
| 11 | +import json |
| 12 | +import re |
| 13 | +import zlib |
| 14 | + |
| 15 | + |
| 16 | +def decompress(body, content_encoding): |
| 17 | + """Transparently decode a compressed request body by its Content-Encoding. |
| 18 | + gzip and deflate use the standard library; br and zstd are supported when the |
| 19 | + optional library is installed. |
| 20 | + """ |
| 21 | + enc = (content_encoding or "").lower().strip() |
| 22 | + if not enc or enc == "identity": |
| 23 | + return body |
| 24 | + if enc in ("gzip", "x-gzip"): |
| 25 | + return gzip.decompress(body) |
| 26 | + if enc == "deflate": |
| 27 | + try: |
| 28 | + return zlib.decompress(body) |
| 29 | + except zlib.error: |
| 30 | + return zlib.decompress(body, -zlib.MAX_WBITS) # raw deflate stream |
| 31 | + if enc == "br": |
| 32 | + import brotli |
| 33 | + return brotli.decompress(body) |
| 34 | + if enc == "zstd": |
| 35 | + import zstandard |
| 36 | + return zstandard.ZstdDecompressor().decompress(body) |
| 37 | + raise ValueError(f"unsupported Content-Encoding: {enc}") |
| 38 | + |
| 39 | + |
| 40 | +def srid_from_crs(crs, default=4326): |
| 41 | + """Extract an EPSG code from an OGC CRS object/string (e.g. EPSG::25832).""" |
| 42 | + if not crs: |
| 43 | + return default |
| 44 | + text = crs if isinstance(crs, str) else json.dumps(crs) |
| 45 | + m = re.search(r"EPSG\D*?(\d{4,5})", text) |
| 46 | + return int(m.group(1)) if m else default |
| 47 | + |
| 48 | + |
| 49 | +def _timestamp(feature, props): |
| 50 | + return (feature.get("when") or props.get("datetime") or props.get("timestamp") |
| 51 | + or props.get("time") or props.get("t")) |
| 52 | + |
| 53 | + |
| 54 | +def parse_geojson_points(body): |
| 55 | + """A FeatureCollection of Point features, each with an id and a timestamp, |
| 56 | + into a list of {id, x, y, t} observations plus the SRID. |
| 57 | + """ |
| 58 | + gj = json.loads(body.decode("utf-8") if isinstance(body, (bytes, bytearray)) else body) |
| 59 | + if gj.get("type") != "FeatureCollection": |
| 60 | + raise ValueError("bulk GeoJSON ingest expects a FeatureCollection") |
| 61 | + srid = srid_from_crs(gj.get("crs")) |
| 62 | + observations = [] |
| 63 | + for feat in gj.get("features", []): |
| 64 | + if feat.get("type") != "Feature": |
| 65 | + continue |
| 66 | + geom = feat.get("geometry") or {} |
| 67 | + if geom.get("type") != "Point": |
| 68 | + raise ValueError("bulk ingest expects Point geometries") |
| 69 | + coords = geom.get("coordinates") or [] |
| 70 | + if len(coords) < 2: |
| 71 | + raise ValueError("a Point needs [x, y] coordinates") |
| 72 | + props = feat.get("properties") or {} |
| 73 | + ts = _timestamp(feat, props) |
| 74 | + if ts is None: |
| 75 | + raise ValueError("each feature needs a timestamp (properties.datetime)") |
| 76 | + fid = feat.get("id") if feat.get("id") is not None else props.get("id") |
| 77 | + if fid is None: |
| 78 | + raise ValueError("each feature needs an id (the vehicle identifier)") |
| 79 | + observations.append({"id": str(fid), "x": float(coords[0]), "y": float(coords[1]), "t": str(ts)}) |
| 80 | + return observations, srid |
| 81 | + |
| 82 | + |
| 83 | +def parse_geoparquet(body, geom_col="geometry", id_col="id", time_col="ts"): |
| 84 | + """A GeoParquet byte payload (one row per observation: WKB Point, id, ts) into |
| 85 | + {id, wkb, t} observations. The WKB is handed to PostGIS, not parsed here. |
| 86 | + """ |
| 87 | + import pyarrow.parquet as pq |
| 88 | + table = pq.read_table(io.BytesIO(body)) |
| 89 | + for col in (geom_col, id_col, time_col): |
| 90 | + if col not in table.column_names: |
| 91 | + raise ValueError(f"GeoParquet is missing the '{col}' column") |
| 92 | + geoms = table.column(geom_col).to_pylist() |
| 93 | + ids = table.column(id_col).to_pylist() |
| 94 | + times = table.column(time_col).to_pylist() |
| 95 | + observations = [] |
| 96 | + for g, fid, ts in zip(geoms, ids, times): |
| 97 | + if not g: |
| 98 | + raise ValueError("GeoParquet row is missing the geometry") |
| 99 | + observations.append({"id": str(fid), "wkb": bytes(g), "t": str(ts)}) |
| 100 | + return observations, 4326 |
| 101 | + |
| 102 | + |
| 103 | +# one instant appended to a tgeompoint trajectory; the point comes either from |
| 104 | +# x/y (GeoJSON) or from WKB handed to PostGIS (GeoParquet) |
| 105 | +_INST_XY = "tgeompoint(ST_SetSRID(ST_MakePoint(%s, %s), %s), %s::timestamptz)" |
| 106 | +_INST_WKB = "tgeompoint(ST_SetSRID(ST_GeomFromWKB(%s), %s), %s::timestamptz)" |
| 107 | + |
| 108 | + |
| 109 | +def _instant(observation, srid): |
| 110 | + if "wkb" in observation: |
| 111 | + return _INST_WKB, (observation["wkb"], srid, observation["t"]) |
| 112 | + return _INST_XY, (observation["x"], observation["y"], srid, observation["t"]) |
| 113 | + |
| 114 | + |
| 115 | +def ensure_tables(cursor): |
| 116 | + cursor.execute(""" |
| 117 | + CREATE TABLE IF NOT EXISTS moving_features ( |
| 118 | + id TEXT PRIMARY KEY, |
| 119 | + collection_id TEXT REFERENCES collections(id) ON DELETE CASCADE, |
| 120 | + type TEXT DEFAULT 'Feature', |
| 121 | + geometry geometry, properties JSONB, bbox JSONB, |
| 122 | + time_range TSTZRANGE, crs JSONB, trs JSONB, |
| 123 | + created_at TIMESTAMP DEFAULT NOW())""") |
| 124 | + cursor.execute(""" |
| 125 | + CREATE TABLE IF NOT EXISTS temporal_geometries ( |
| 126 | + id SERIAL PRIMARY KEY, |
| 127 | + feature_id TEXT REFERENCES moving_features(id) ON DELETE CASCADE, |
| 128 | + collection_id TEXT REFERENCES collections(id) ON DELETE CASCADE, |
| 129 | + geometry_type TEXT, geometry geometry, trajectory tgeompoint, |
| 130 | + interpolation TEXT, base JSONB, |
| 131 | + created_at TIMESTAMP DEFAULT NOW())""") |
| 132 | + |
| 133 | + |
| 134 | +def append_observations(cursor, collection_id, observations, srid): |
| 135 | + """Append each observation as one instant, creating the feature/trajectory on |
| 136 | + first sight and extending it with appendInstant afterwards. Runs inside the |
| 137 | + caller's transaction so the whole batch commits atomically. |
| 138 | + """ |
| 139 | + created, extended = set(), 0 |
| 140 | + for o in observations: |
| 141 | + inst, args = _instant(o, srid) |
| 142 | + cursor.execute( |
| 143 | + "INSERT INTO moving_features (id, collection_id, type) VALUES (%s, %s, 'Feature') " |
| 144 | + "ON CONFLICT (id) DO NOTHING RETURNING id", (o["id"], collection_id)) |
| 145 | + if cursor.fetchone() is not None: |
| 146 | + created.add(o["id"]) |
| 147 | + cursor.execute( |
| 148 | + f"UPDATE temporal_geometries SET trajectory = appendInstant(trajectory, {inst}) " |
| 149 | + "WHERE feature_id = %s RETURNING id", (*args, o["id"])) |
| 150 | + if cursor.fetchone() is None: |
| 151 | + cursor.execute( |
| 152 | + "INSERT INTO temporal_geometries " |
| 153 | + "(feature_id, collection_id, geometry_type, trajectory, interpolation) " |
| 154 | + f"VALUES (%s, %s, 'MovingPoint', {inst}, 'Linear')", |
| 155 | + (o["id"], collection_id, *args)) |
| 156 | + extended += 1 |
| 157 | + return {"observations": extended, "featuresCreated": len(created), |
| 158 | + "featuresExtended": extended - len(created)} |
0 commit comments