@@ -61,14 +61,6 @@ const VIDEO_INFERENCE_INTERVAL_MS = 750;
6161const VIDEO_RENDER_SCORE_THRESHOLD = 0.35 ;
6262const VIDEO_MODEL_PATH = "/static/models/video_cv.onnx" ;
6363const VIDEO_FALLBACK_INPUT_SIZE = 224 ;
64- const RETINAFACE_INPUT_HEIGHT = 608 ;
65- const RETINAFACE_INPUT_WIDTH = 640 ;
66- const RETINAFACE_CONFIDENCE_THRESHOLD = 0.75 ;
67- const RETINAFACE_NMS_THRESHOLD = 0.4 ;
68- const RETINAFACE_VARIANCES = [ 0.1 , 0.2 ] ;
69- const RETINAFACE_MIN_SIZES = [ [ 16 , 32 ] , [ 64 , 128 ] , [ 256 , 512 ] ] ;
70- const RETINAFACE_STEPS = [ 8 , 16 , 32 ] ;
71- const RETINAFACE_MEAN_BGR = [ 104 , 117 , 123 ] ;
7264const STORED_AGENT_ID_KEY = "ws_wasm_agent.agent_id" ;
7365let currentAgentId = null ;
7466
@@ -320,21 +312,6 @@ const ensureVideoOverlayContext = () => {
320312 return videoOverlayContext ;
321313} ;
322314
323- const isRetinaFaceSession = ( session = videoCvSession ) => {
324- if ( ! session ) {
325- return false ;
326- }
327-
328- const inputNames = Array . isArray ( session . inputNames ) ? session . inputNames : [ ] ;
329- const outputNames = Array . isArray ( session . outputNames ) ? session . outputNames : [ ] ;
330- const allNames = inputNames . concat ( outputNames ) . map ( ( name ) => String ( name ) . toLowerCase ( ) ) ;
331- if ( allNames . some ( ( name ) => name . includes ( "retinaface" ) ) ) {
332- return true ;
333- }
334-
335- return outputNames . length === 3 && inputNames . length === 1 ;
336- } ;
337-
338315const selectVideoModelInputName = ( session ) => {
339316 const inputNames = Array . isArray ( session ?. inputNames ) ? session . inputNames : [ ] ;
340317 if ( ! inputNames . length ) {
@@ -402,18 +379,6 @@ const resolveVideoModelLayout = () => {
402379 throw new Error ( "Video CV model is not loaded." ) ;
403380 }
404381
405- if ( isRetinaFaceSession ( videoCvSession ) ) {
406- return {
407- dataType : "float32" ,
408- channels : 3 ,
409- width : RETINAFACE_INPUT_WIDTH ,
410- height : RETINAFACE_INPUT_HEIGHT ,
411- tensorDimensions : [ 1 , RETINAFACE_INPUT_HEIGHT , RETINAFACE_INPUT_WIDTH , 3 ] ,
412- layout : "nhwc" ,
413- profile : "retinaface" ,
414- } ;
415- }
416-
417382 const metadata = videoCvSession . inputMetadata ?. [ videoCvInputName ] ;
418383 const dataType = metadata ?. type ?? "float32" ;
419384 if ( dataType !== "float32" && dataType !== "uint8" ) {
@@ -469,7 +434,6 @@ const resolveVideoModelLayout = () => {
469434 height,
470435 tensorDimensions : [ 1 , channels , height , width ] ,
471436 layout : "nchw" ,
472- profile : "generic" ,
473437 } ;
474438 }
475439
@@ -485,7 +449,6 @@ const resolveVideoModelLayout = () => {
485449 height,
486450 tensorDimensions : [ 1 , height , width , channels ] ,
487451 layout : "nhwc" ,
488- profile : "generic" ,
489452 } ;
490453} ;
491454
@@ -505,29 +468,11 @@ const buildVideoInputTensor = () => {
505468 height,
506469 tensorDimensions,
507470 layout,
508- profile,
509471 } = resolveVideoModelLayout ( ) ;
510472 const context = ensureVideoCvCanvas ( ) ;
511473 videoCvCanvas . width = width ;
512474 videoCvCanvas . height = height ;
513- let resizeRatio = 1 ;
514- if ( profile === "retinaface" ) {
515- const sourceWidth = videoPreview . videoWidth ;
516- const sourceHeight = videoPreview . videoHeight ;
517- const targetRatio = height / width ;
518- if ( sourceHeight / sourceWidth <= targetRatio ) {
519- resizeRatio = width / sourceWidth ;
520- } else {
521- resizeRatio = height / sourceHeight ;
522- }
523-
524- const resizedWidth = Math . max ( 1 , Math . min ( width , Math . round ( sourceWidth * resizeRatio ) ) ) ;
525- const resizedHeight = Math . max ( 1 , Math . min ( height , Math . round ( sourceHeight * resizeRatio ) ) ) ;
526- context . clearRect ( 0 , 0 , width , height ) ;
527- context . drawImage ( videoPreview , 0 , 0 , resizedWidth , resizedHeight ) ;
528- } else {
529- context . drawImage ( videoPreview , 0 , 0 , width , height ) ;
530- }
475+ context . drawImage ( videoPreview , 0 , 0 , width , height ) ;
531476
532477 const rgba = context . getImageData ( 0 , 0 , width , height ) . data ;
533478 const elementCount = width * height * channels ;
@@ -541,14 +486,6 @@ const buildVideoInputTensor = () => {
541486 const green = rgba [ rgbaIndex + 1 ] ;
542487 const blue = rgba [ rgbaIndex + 2 ] ;
543488
544- if ( profile === "retinaface" ) {
545- const tensorIndex = pixelIndex * channels ;
546- tensorData [ tensorIndex ] = blue - RETINAFACE_MEAN_BGR [ 0 ] ;
547- tensorData [ tensorIndex + 1 ] = green - RETINAFACE_MEAN_BGR [ 1 ] ;
548- tensorData [ tensorIndex + 2 ] = red - RETINAFACE_MEAN_BGR [ 2 ] ;
549- continue ;
550- }
551-
552489 if ( channels === 1 ) {
553490 const grayscale = Math . round ( 0.299 * red + 0.587 * green + 0.114 * blue ) ;
554491 tensorData [ pixelIndex ] = dataType === "uint8" ? grayscale : grayscale / 255 ;
@@ -581,17 +518,7 @@ const buildVideoInputTensor = () => {
581518 }
582519 }
583520
584- return {
585- tensor : new window . ort . Tensor ( dataType , tensorData , tensorDimensions ) ,
586- preprocess : {
587- profile,
588- inputWidth : width ,
589- inputHeight : height ,
590- resizeRatio,
591- sourceWidth : videoPreview . videoWidth ,
592- sourceHeight : videoPreview . videoHeight ,
593- } ,
594- } ;
521+ return new window . ort . Tensor ( dataType , tensorData , tensorDimensions ) ;
595522} ;
596523
597524const looksLikeBoxes = ( tensor ) => {
@@ -642,155 +569,6 @@ const normalizeBox = (boxValues, format = "xyxy") => {
642569 return normalized ;
643570} ;
644571
645- const clamp = ( value , min , max ) => Math . max ( min , Math . min ( max , value ) ) ;
646-
647- const buildRetinaFacePriors = ( imageHeight , imageWidth ) => {
648- const priors = [ ] ;
649- RETINAFACE_STEPS . forEach ( ( step , index ) => {
650- const featureMapHeight = Math . ceil ( imageHeight / step ) ;
651- const featureMapWidth = Math . ceil ( imageWidth / step ) ;
652- const minSizes = RETINAFACE_MIN_SIZES [ index ] ;
653-
654- for ( let row = 0 ; row < featureMapHeight ; row += 1 ) {
655- for ( let column = 0 ; column < featureMapWidth ; column += 1 ) {
656- minSizes . forEach ( ( minSize ) => {
657- priors . push ( [
658- ( ( column + 0.5 ) * step ) / imageWidth ,
659- ( ( row + 0.5 ) * step ) / imageHeight ,
660- minSize / imageWidth ,
661- minSize / imageHeight ,
662- ] ) ;
663- } ) ;
664- }
665- }
666- } ) ;
667- return priors ;
668- } ;
669-
670- const decodeRetinaFaceBox = ( loc , prior ) => {
671- const centerX = prior [ 0 ] + loc [ 0 ] * RETINAFACE_VARIANCES [ 0 ] * prior [ 2 ] ;
672- const centerY = prior [ 1 ] + loc [ 1 ] * RETINAFACE_VARIANCES [ 0 ] * prior [ 3 ] ;
673- const width = prior [ 2 ] * Math . exp ( loc [ 2 ] * RETINAFACE_VARIANCES [ 1 ] ) ;
674- const height = prior [ 3 ] * Math . exp ( loc [ 3 ] * RETINAFACE_VARIANCES [ 1 ] ) ;
675- return [
676- centerX - width / 2 ,
677- centerY - height / 2 ,
678- centerX + width / 2 ,
679- centerY + height / 2 ,
680- ] ;
681- } ;
682-
683- const computeIoU = ( left , right ) => {
684- const x1 = Math . max ( left . box [ 0 ] , right . box [ 0 ] ) ;
685- const y1 = Math . max ( left . box [ 1 ] , right . box [ 1 ] ) ;
686- const x2 = Math . min ( left . box [ 2 ] , right . box [ 2 ] ) ;
687- const y2 = Math . min ( left . box [ 3 ] , right . box [ 3 ] ) ;
688- const width = Math . max ( 0 , x2 - x1 + 1 ) ;
689- const height = Math . max ( 0 , y2 - y1 + 1 ) ;
690- const intersection = width * height ;
691- const leftArea = Math . max ( 0 , left . box [ 2 ] - left . box [ 0 ] + 1 ) * Math . max ( 0 , left . box [ 3 ] - left . box [ 1 ] + 1 ) ;
692- const rightArea = Math . max ( 0 , right . box [ 2 ] - right . box [ 0 ] + 1 ) * Math . max ( 0 , right . box [ 3 ] - right . box [ 1 ] + 1 ) ;
693- return intersection / Math . max ( 1e-6 , leftArea + rightArea - intersection ) ;
694- } ;
695-
696- const applyNms = ( detections , threshold ) => {
697- const sorted = [ ...detections ] . sort ( ( left , right ) => right . score - left . score ) ;
698- const kept = [ ] ;
699-
700- sorted . forEach ( ( candidate ) => {
701- if ( kept . every ( ( accepted ) => computeIoU ( candidate , accepted ) <= threshold ) ) {
702- kept . push ( candidate ) ;
703- }
704- } ) ;
705-
706- return kept ;
707- } ;
708-
709- const decodeRetinaFaceOutputs = ( outputs , preprocess ) => {
710- if ( ! preprocess || preprocess . profile !== "retinaface" ) {
711- return null ;
712- }
713-
714- const outputNames = Array . isArray ( videoCvSession ?. outputNames ) ? videoCvSession . outputNames : [ ] ;
715- if ( outputNames . length < 3 ) {
716- return null ;
717- }
718-
719- const locTensor = outputs [ outputNames [ 0 ] ] ;
720- const confTensor = outputs [ outputNames [ 1 ] ] ;
721- const landmTensor = outputs [ outputNames [ 2 ] ] ;
722- if ( ! locTensor || ! confTensor || ! landmTensor ) {
723- return null ;
724- }
725-
726- const locValues = flattenFinite ( locTensor ) ;
727- const confValues = flattenFinite ( confTensor ) ;
728- const landmValues = flattenFinite ( landmTensor ) ;
729- const priorCount = locValues . length / 4 ;
730- if ( priorCount <= 0 || confValues . length / 2 !== priorCount || landmValues . length / 10 !== priorCount ) {
731- return null ;
732- }
733-
734- const priors = buildRetinaFacePriors ( preprocess . inputHeight , preprocess . inputWidth ) ;
735- if ( priors . length !== priorCount ) {
736- return null ;
737- }
738-
739- const detections = [ ] ;
740- for ( let index = 0 ; index < priorCount ; index += 1 ) {
741- const score = softmax ( confValues . slice ( index * 2 , index * 2 + 2 ) ) [ 1 ] ?? 0 ;
742- if ( score < RETINAFACE_CONFIDENCE_THRESHOLD ) {
743- continue ;
744- }
745-
746- const decoded = decodeRetinaFaceBox (
747- locValues . slice ( index * 4 , index * 4 + 4 ) ,
748- priors [ index ] ,
749- ) ;
750- const scaledBox = [
751- clamp ( ( decoded [ 0 ] * preprocess . inputWidth ) / preprocess . resizeRatio , 0 , preprocess . sourceWidth ) ,
752- clamp ( ( decoded [ 1 ] * preprocess . inputHeight ) / preprocess . resizeRatio , 0 , preprocess . sourceHeight ) ,
753- clamp ( ( decoded [ 2 ] * preprocess . inputWidth ) / preprocess . resizeRatio , 0 , preprocess . sourceWidth ) ,
754- clamp ( ( decoded [ 3 ] * preprocess . inputHeight ) / preprocess . resizeRatio , 0 , preprocess . sourceHeight ) ,
755- ] ;
756-
757- detections . push ( {
758- label : "face" ,
759- class_index : 0 ,
760- score,
761- box : scaledBox ,
762- } ) ;
763- }
764-
765- const filtered = applyNms ( detections , RETINAFACE_NMS_THRESHOLD ) ;
766- if ( ! filtered . length ) {
767- return {
768- mode : "detection" ,
769- detections : [ ] ,
770- detected_class : "no_detection" ,
771- class_index : - 1 ,
772- confidence : 0 ,
773- probabilities : [ ] ,
774- top_classes : [ ] ,
775- } ;
776- }
777-
778- const best = filtered [ 0 ] ;
779- return {
780- mode : "detection" ,
781- detections : filtered ,
782- detected_class : best . label ,
783- class_index : best . class_index ,
784- confidence : best . score ,
785- probabilities : filtered . map ( ( entry ) => entry . score ) ,
786- top_classes : filtered . slice ( 0 , 3 ) . map ( ( entry ) => ( {
787- label : entry . label ,
788- index : entry . class_index ,
789- probability : entry . score ,
790- } ) ) ,
791- } ;
792- } ;
793-
794572const findDetectionTensor = ( entries , patterns , predicate = ( ) => true ) => {
795573 return entries . find ( ( [ name , tensor ] ) => {
796574 const normalizedName = String ( name ) . toLowerCase ( ) ;
@@ -1007,12 +785,7 @@ const decodeClassificationOutputs = (output) => {
1007785 } ;
1008786} ;
1009787
1010- const summarizeVideoOutput = ( outputMap , preprocess = null ) => {
1011- const retinaFaceSummary = decodeRetinaFaceOutputs ( outputMap , preprocess ) ;
1012- if ( retinaFaceSummary ) {
1013- return retinaFaceSummary ;
1014- }
1015-
788+ const summarizeVideoOutput = ( outputMap ) => {
1016789 const detectionSummary = decodeDetectionOutputs ( outputMap ) ;
1017790 if ( detectionSummary ) {
1018791 return detectionSummary ;
@@ -1154,10 +927,10 @@ const inferVideoPrediction = async () => {
1154927 lastVideoInferenceAt = now ;
1155928
1156929 try {
1157- const { tensor : input , preprocess } = buildVideoInputTensor ( ) ;
930+ const input = buildVideoInputTensor ( ) ;
1158931 const outputMap = await videoCvSession . run ( { [ videoCvInputName ] : input } ) ;
1159932 const output = outputMap [ videoCvOutputName ] ;
1160- const summary = summarizeVideoOutput ( outputMap , preprocess ) ;
933+ const summary = summarizeVideoOutput ( outputMap ) ;
1161934 const labelChanged = summary . detected_class !== lastVideoCvLabel ;
1162935 lastVideoCvLabel = summary . detected_class ;
1163936 lastVideoInferenceSummary = summary ;
0 commit comments