Skip to content

Commit d3e5f0a

Browse files
Cleanup leftover CV app.js code (#10)
1 parent f2b4d16 commit d3e5f0a

1 file changed

Lines changed: 5 additions & 232 deletions

File tree

  • services/ws-server/static

services/ws-server/static/app.js

Lines changed: 5 additions & 232 deletions
Original file line number | Diff line number | Diff line change
@@ -61,14 +61,6 @@ const VIDEO_INFERENCE_INTERVAL_MS = 750;
6161
const VIDEO_RENDER_SCORE_THRESHOLD = 0.35;
6262
const VIDEO_MODEL_PATH = "/static/models/video_cv.onnx";
6363
const VIDEO_FALLBACK_INPUT_SIZE = 224;
64-
const RETINAFACE_INPUT_HEIGHT = 608;
65-
const RETINAFACE_INPUT_WIDTH = 640;
66-
const RETINAFACE_CONFIDENCE_THRESHOLD = 0.75;
67-
const RETINAFACE_NMS_THRESHOLD = 0.4;
68-
const RETINAFACE_VARIANCES = [0.1, 0.2];
69-
const RETINAFACE_MIN_SIZES = [[16, 32], [64, 128], [256, 512]];
70-
const RETINAFACE_STEPS = [8, 16, 32];
71-
const RETINAFACE_MEAN_BGR = [104, 117, 123];
7264
const STORED_AGENT_ID_KEY = "ws_wasm_agent.agent_id";
7365
let currentAgentId = null;
7466

@@ -320,21 +312,6 @@ const ensureVideoOverlayContext = () => {
320312
return videoOverlayContext;
321313
};
322314

323-
const isRetinaFaceSession = (session = videoCvSession) => {
324-
if (!session) {
325-
return false;
326-
}
327-
328-
const inputNames = Array.isArray(session.inputNames) ? session.inputNames : [];
329-
const outputNames = Array.isArray(session.outputNames) ? session.outputNames : [];
330-
const allNames = inputNames.concat(outputNames).map((name) => String(name).toLowerCase());
331-
if (allNames.some((name) => name.includes("retinaface"))) {
332-
return true;
333-
}
334-
335-
return outputNames.length === 3 && inputNames.length === 1;
336-
};
337-
338315
const selectVideoModelInputName = (session) => {
339316
const inputNames = Array.isArray(session?.inputNames) ? session.inputNames : [];
340317
if (!inputNames.length) {
@@ -402,18 +379,6 @@ const resolveVideoModelLayout = () => {
402379
throw new Error("Video CV model is not loaded.");
403380
}
404381

405-
if (isRetinaFaceSession(videoCvSession)) {
406-
return {
407-
dataType: "float32",
408-
channels: 3,
409-
width: RETINAFACE_INPUT_WIDTH,
410-
height: RETINAFACE_INPUT_HEIGHT,
411-
tensorDimensions: [1, RETINAFACE_INPUT_HEIGHT, RETINAFACE_INPUT_WIDTH, 3],
412-
layout: "nhwc",
413-
profile: "retinaface",
414-
};
415-
}
416-
417382
const metadata = videoCvSession.inputMetadata?.[videoCvInputName];
418383
const dataType = metadata?.type ?? "float32";
419384
if (dataType !== "float32" && dataType !== "uint8") {
@@ -469,7 +434,6 @@ const resolveVideoModelLayout = () => {
469434
height,
470435
tensorDimensions: [1, channels, height, width],
471436
layout: "nchw",
472-
profile: "generic",
473437
};
474438
}
475439

@@ -485,7 +449,6 @@ const resolveVideoModelLayout = () => {
485449
height,
486450
tensorDimensions: [1, height, width, channels],
487451
layout: "nhwc",
488-
profile: "generic",
489452
};
490453
};
491454

@@ -505,29 +468,11 @@ const buildVideoInputTensor = () => {
505468
height,
506469
tensorDimensions,
507470
layout,
508-
profile,
509471
} = resolveVideoModelLayout();
510472
const context = ensureVideoCvCanvas();
511473
videoCvCanvas.width = width;
512474
videoCvCanvas.height = height;
513-
let resizeRatio = 1;
514-
if (profile === "retinaface") {
515-
const sourceWidth = videoPreview.videoWidth;
516-
const sourceHeight = videoPreview.videoHeight;
517-
const targetRatio = height / width;
518-
if (sourceHeight / sourceWidth <= targetRatio) {
519-
resizeRatio = width / sourceWidth;
520-
} else {
521-
resizeRatio = height / sourceHeight;
522-
}
523-
524-
const resizedWidth = Math.max(1, Math.min(width, Math.round(sourceWidth * resizeRatio)));
525-
const resizedHeight = Math.max(1, Math.min(height, Math.round(sourceHeight * resizeRatio)));
526-
context.clearRect(0, 0, width, height);
527-
context.drawImage(videoPreview, 0, 0, resizedWidth, resizedHeight);
528-
} else {
529-
context.drawImage(videoPreview, 0, 0, width, height);
530-
}
475+
context.drawImage(videoPreview, 0, 0, width, height);
531476

532477
const rgba = context.getImageData(0, 0, width, height).data;
533478
const elementCount = width * height * channels;
@@ -541,14 +486,6 @@ const buildVideoInputTensor = () => {
541486
const green = rgba[rgbaIndex + 1];
542487
const blue = rgba[rgbaIndex + 2];
543488

544-
if (profile === "retinaface") {
545-
const tensorIndex = pixelIndex * channels;
546-
tensorData[tensorIndex] = blue - RETINAFACE_MEAN_BGR[0];
547-
tensorData[tensorIndex + 1] = green - RETINAFACE_MEAN_BGR[1];
548-
tensorData[tensorIndex + 2] = red - RETINAFACE_MEAN_BGR[2];
549-
continue;
550-
}
551-
552489
if (channels === 1) {
553490
const grayscale = Math.round(0.299 * red + 0.587 * green + 0.114 * blue);
554491
tensorData[pixelIndex] = dataType === "uint8" ? grayscale : grayscale / 255;
@@ -581,17 +518,7 @@ const buildVideoInputTensor = () => {
581518
}
582519
}
583520

584-
return {
585-
tensor: new window.ort.Tensor(dataType, tensorData, tensorDimensions),
586-
preprocess: {
587-
profile,
588-
inputWidth: width,
589-
inputHeight: height,
590-
resizeRatio,
591-
sourceWidth: videoPreview.videoWidth,
592-
sourceHeight: videoPreview.videoHeight,
593-
},
594-
};
521+
return new window.ort.Tensor(dataType, tensorData, tensorDimensions);
595522
};
596523

597524
const looksLikeBoxes = (tensor) => {
@@ -642,155 +569,6 @@ const normalizeBox = (boxValues, format = "xyxy") => {
642569
return normalized;
643570
};
644571

645-
const clamp = (value, min, max) => Math.max(min, Math.min(max, value));
646-
647-
const buildRetinaFacePriors = (imageHeight, imageWidth) => {
648-
const priors = [];
649-
RETINAFACE_STEPS.forEach((step, index) => {
650-
const featureMapHeight = Math.ceil(imageHeight / step);
651-
const featureMapWidth = Math.ceil(imageWidth / step);
652-
const minSizes = RETINAFACE_MIN_SIZES[index];
653-
654-
for (let row = 0; row < featureMapHeight; row += 1) {
655-
for (let column = 0; column < featureMapWidth; column += 1) {
656-
minSizes.forEach((minSize) => {
657-
priors.push([
658-
((column + 0.5) * step) / imageWidth,
659-
((row + 0.5) * step) / imageHeight,
660-
minSize / imageWidth,
661-
minSize / imageHeight,
662-
]);
663-
});
664-
}
665-
}
666-
});
667-
return priors;
668-
};
669-
670-
const decodeRetinaFaceBox = (loc, prior) => {
671-
const centerX = prior[0] + loc[0] * RETINAFACE_VARIANCES[0] * prior[2];
672-
const centerY = prior[1] + loc[1] * RETINAFACE_VARIANCES[0] * prior[3];
673-
const width = prior[2] * Math.exp(loc[2] * RETINAFACE_VARIANCES[1]);
674-
const height = prior[3] * Math.exp(loc[3] * RETINAFACE_VARIANCES[1]);
675-
return [
676-
centerX - width / 2,
677-
centerY - height / 2,
678-
centerX + width / 2,
679-
centerY + height / 2,
680-
];
681-
};
682-
683-
const computeIoU = (left, right) => {
684-
const x1 = Math.max(left.box[0], right.box[0]);
685-
const y1 = Math.max(left.box[1], right.box[1]);
686-
const x2 = Math.min(left.box[2], right.box[2]);
687-
const y2 = Math.min(left.box[3], right.box[3]);
688-
const width = Math.max(0, x2 - x1 + 1);
689-
const height = Math.max(0, y2 - y1 + 1);
690-
const intersection = width * height;
691-
const leftArea = Math.max(0, left.box[2] - left.box[0] + 1) * Math.max(0, left.box[3] - left.box[1] + 1);
692-
const rightArea = Math.max(0, right.box[2] - right.box[0] + 1) * Math.max(0, right.box[3] - right.box[1] + 1);
693-
return intersection / Math.max(1e-6, leftArea + rightArea - intersection);
694-
};
695-
696-
const applyNms = (detections, threshold) => {
697-
const sorted = [...detections].sort((left, right) => right.score - left.score);
698-
const kept = [];
699-
700-
sorted.forEach((candidate) => {
701-
if (kept.every((accepted) => computeIoU(candidate, accepted) <= threshold)) {
702-
kept.push(candidate);
703-
}
704-
});
705-
706-
return kept;
707-
};
708-
709-
const decodeRetinaFaceOutputs = (outputs, preprocess) => {
710-
if (!preprocess || preprocess.profile !== "retinaface") {
711-
return null;
712-
}
713-
714-
const outputNames = Array.isArray(videoCvSession?.outputNames) ? videoCvSession.outputNames : [];
715-
if (outputNames.length < 3) {
716-
return null;
717-
}
718-
719-
const locTensor = outputs[outputNames[0]];
720-
const confTensor = outputs[outputNames[1]];
721-
const landmTensor = outputs[outputNames[2]];
722-
if (!locTensor || !confTensor || !landmTensor) {
723-
return null;
724-
}
725-
726-
const locValues = flattenFinite(locTensor);
727-
const confValues = flattenFinite(confTensor);
728-
const landmValues = flattenFinite(landmTensor);
729-
const priorCount = locValues.length / 4;
730-
if (priorCount <= 0 || confValues.length / 2 !== priorCount || landmValues.length / 10 !== priorCount) {
731-
return null;
732-
}
733-
734-
const priors = buildRetinaFacePriors(preprocess.inputHeight, preprocess.inputWidth);
735-
if (priors.length !== priorCount) {
736-
return null;
737-
}
738-
739-
const detections = [];
740-
for (let index = 0; index < priorCount; index += 1) {
741-
const score = softmax(confValues.slice(index * 2, index * 2 + 2))[1] ?? 0;
742-
if (score < RETINAFACE_CONFIDENCE_THRESHOLD) {
743-
continue;
744-
}
745-
746-
const decoded = decodeRetinaFaceBox(
747-
locValues.slice(index * 4, index * 4 + 4),
748-
priors[index],
749-
);
750-
const scaledBox = [
751-
clamp((decoded[0] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth),
752-
clamp((decoded[1] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight),
753-
clamp((decoded[2] * preprocess.inputWidth) / preprocess.resizeRatio, 0, preprocess.sourceWidth),
754-
clamp((decoded[3] * preprocess.inputHeight) / preprocess.resizeRatio, 0, preprocess.sourceHeight),
755-
];
756-
757-
detections.push({
758-
label: "face",
759-
class_index: 0,
760-
score,
761-
box: scaledBox,
762-
});
763-
}
764-
765-
const filtered = applyNms(detections, RETINAFACE_NMS_THRESHOLD);
766-
if (!filtered.length) {
767-
return {
768-
mode: "detection",
769-
detections: [],
770-
detected_class: "no_detection",
771-
class_index: -1,
772-
confidence: 0,
773-
probabilities: [],
774-
top_classes: [],
775-
};
776-
}
777-
778-
const best = filtered[0];
779-
return {
780-
mode: "detection",
781-
detections: filtered,
782-
detected_class: best.label,
783-
class_index: best.class_index,
784-
confidence: best.score,
785-
probabilities: filtered.map((entry) => entry.score),
786-
top_classes: filtered.slice(0, 3).map((entry) => ({
787-
label: entry.label,
788-
index: entry.class_index,
789-
probability: entry.score,
790-
})),
791-
};
792-
};
793-
794572
const findDetectionTensor = (entries, patterns, predicate = () => true) => {
795573
return entries.find(([name, tensor]) => {
796574
const normalizedName = String(name).toLowerCase();
@@ -1007,12 +785,7 @@ const decodeClassificationOutputs = (output) => {
1007785
};
1008786
};
1009787

1010-
const summarizeVideoOutput = (outputMap, preprocess = null) => {
1011-
const retinaFaceSummary = decodeRetinaFaceOutputs(outputMap, preprocess);
1012-
if (retinaFaceSummary) {
1013-
return retinaFaceSummary;
1014-
}
1015-
788+
const summarizeVideoOutput = (outputMap) => {
1016789
const detectionSummary = decodeDetectionOutputs(outputMap);
1017790
if (detectionSummary) {
1018791
return detectionSummary;
@@ -1154,10 +927,10 @@ const inferVideoPrediction = async () => {
1154927
lastVideoInferenceAt = now;
1155928

1156929
try {
1157-
const { tensor: input, preprocess } = buildVideoInputTensor();
930+
const input = buildVideoInputTensor();
1158931
const outputMap = await videoCvSession.run({ [videoCvInputName]: input });
1159932
const output = outputMap[videoCvOutputName];
1160-
const summary = summarizeVideoOutput(outputMap, preprocess);
933+
const summary = summarizeVideoOutput(outputMap);
1161934
const labelChanged = summary.detected_class !== lastVideoCvLabel;
1162935
lastVideoCvLabel = summary.detected_class;
1163936
lastVideoInferenceSummary = summary;

0 commit comments

Comments
 (0)