diff --git a/CMakeLists.txt b/CMakeLists.txt index 331ecddd4d..7fefce132a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,6 +143,9 @@ list( "${AVM_ROOT}/third_party/libyuv/source/scale_neon64.cc" "${AVM_ROOT}/third_party/libyuv/source/scale_win.cc") +list(APPEND AVM_CJSON_SOURCES "${AVM_ROOT}/third_party/cJSON/cJSON.c" + "${AVM_ROOT}/third_party/cJSON/cJSON.h") + list( APPEND AVM_SOURCES @@ -403,8 +406,14 @@ endif() if(CONFIG_AV2_DECODER AND ENABLE_EXAMPLES) add_executable( - avmdec "${AVM_ROOT}/apps/avmdec.c" $ - $) + avmdec + "${AVM_ROOT}/apps/avmdec.c" + $ + $ + "${AVM_ROOT}/common/xlayer_config.h" + "${AVM_ROOT}/common/xlayer_config_parse.c" + "${AVM_ROOT}/common/xlayer_config_parse.h" + $) target_sources(avmdec PRIVATE $) add_executable( decode_to_md5 @@ -475,9 +484,18 @@ if(CONFIG_AV2_ENCODER) if(ENABLE_EXAMPLES) add_executable( avmenc - "${AVM_ROOT}/apps/avmenc.c" $ + "${AVM_ROOT}/apps/avmenc.c" + "${AVM_ROOT}/apps/avmenc_xlayer.c" + "${AVM_ROOT}/apps/avmenc_xlayer.h" + "${AVM_ROOT}/common/xlayer_config.h" + "${AVM_ROOT}/common/xlayer_config_parse.c" + "${AVM_ROOT}/common/xlayer_config_parse.h" + "${AVM_ROOT}/common/tu_assembler.c" + "${AVM_ROOT}/common/tu_assembler.h" + $ $ - $) + $ + $) add_executable( lossless_encoder "${AVM_ROOT}/examples/lossless_encoder.c" @@ -947,6 +965,9 @@ if(ENABLE_EXAMPLES set_property(TARGET ${avm_app} PROPERTY LINKER_LANGUAGE CXX) endforeach() endif() + + # cJSON library for JSON config parsing (used by multi-xlayer encoder) + add_library(avm_cjson OBJECT ${AVM_CJSON_SOURCES}) endif() if(ENABLE_TESTS) diff --git a/apps/avmdec.c b/apps/avmdec.c index 1d80f28558..ee1600a777 100644 --- a/apps/avmdec.c +++ b/apps/avmdec.c @@ -45,6 +45,8 @@ #include "common/rawenc.h" #include "common/y4menc.h" +#include "common/xlayer_config.h" +#include "common/xlayer_config_parse.h" #if CONFIG_LIBYUV #include "third_party/libyuv/include/libyuv/scale.h" @@ -52,6 +54,58 @@ static const char *exec_name; +// Buffered frame for flush reordering 
in interleaved output mode. +typedef struct FlushFrame { + avm_image_t *img; // Allocated deep copy of the decoded image + unsigned int order_hint; // display_order_hint for sorting + int xlayer_id; + int mlayer_id; +} FlushFrame; + +static int compare_flush_frames(const void *a, const void *b) { + const FlushFrame *fa = (const FlushFrame *)a; + const FlushFrame *fb = (const FlushFrame *)b; + if (fa->order_hint != fb->order_hint) + return (fa->order_hint < fb->order_hint) ? -1 : 1; + if (fa->xlayer_id != fb->xlayer_id) + return (fa->xlayer_id < fb->xlayer_id) ? -1 : 1; + if (fa->mlayer_id != fb->mlayer_id) + return (fa->mlayer_id < fb->mlayer_id) ? -1 : 1; + return 0; +} + +// Deep-copy an avm_image_t: allocate a new image and copy pixel data. +static avm_image_t *deep_copy_image(const avm_image_t *src) { + avm_image_t *dst = avm_img_alloc(NULL, src->fmt, src->d_w, src->d_h, 32); + if (!dst) return NULL; + dst->bit_depth = src->bit_depth; + dst->monochrome = src->monochrome; + dst->csp = src->csp; + dst->range = src->range; + dst->cp = src->cp; + dst->tc = src->tc; + dst->mc = src->mc; + dst->tlayer_id = src->tlayer_id; + dst->mlayer_id = src->mlayer_id; + dst->xlayer_id = src->xlayer_id; + dst->stream_id = src->stream_id; + dst->display_order_hint = src->display_order_hint; + int num_planes = src->monochrome ? 1 : 3; + for (int p = 0; p < num_planes; p++) { + int h = avm_img_plane_height(src, p); + int w = avm_img_plane_width(src, p); + int bps = (src->fmt & AVM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + const unsigned char *s = src->planes[p]; + unsigned char *d = dst->planes[p]; + for (int row = 0; row < h; row++) { + memcpy(d, s, (size_t)w * bps); + s += src->stride[p]; + d += dst->stride[p]; + } + } + return dst; +} + #if CONFIG_PARAKIT_COLLECT_DATA #include "av2/common/entropy_sideinfo.h" #endif @@ -136,6 +190,12 @@ static const arg_def_t bruoptmodearg = ARG_DEF(NULL, "bru-opt-mode", 0, "Use BRU optimized decode mode"); static const arg_def_t icc_file = ARG_DEF(NULL, "icc", 1, "Output ICC profile file"); +static const arg_def_t xlayercfgarg = ARG_DEF( + NULL, "xlayer-config", 1, + "Multi-xlayer JSON config (provides atlas layout for --atlas-composite)"); +static const arg_def_t atlascompositearg = ARG_DEF( + NULL, "atlas-composite", 0, + "Composite decoded xlayers onto atlas canvas (requires --xlayer-config)"); static const arg_def_t *all_args[] = { &help, &codecarg, &use_yv12, @@ -169,6 +229,8 @@ static const arg_def_t *all_args[] = { &help, &randomaccess, &bruoptmodearg, &icc_file, + &xlayercfgarg, + &atlascompositearg, NULL }; #if CONFIG_LANCZOS_RESAMPLE @@ -643,6 +705,104 @@ static FILE *open_outfile(const char *name) { } } +// Dynamic composite groups derived from LCR layer properties. +// Each unique (layer_type, auxiliary_type, view_type) combination +// produces a separate composite output. Layers within a group must +// share the same chroma format; mixed chroma forces separate outputs. +// Mixed bit depth is handled by promoting to the highest bit depth. +typedef struct CompositeGroup { + int layer_type; // TEXTURE_LAYER, AUX_LAYER, etc. + int auxiliary_type; // only meaningful when layer_type == AUX_LAYER + int view_type; // VIEW_UNSPECIFIED, VIEW_CENTER, VIEW_LEFT, etc. 
+ int num_xlayers; // how many xlayers belong to this group + int xlayer_ids[MAX_NUM_XLAYERS]; // xlayer_ids in this group + int xlayer_indices[MAX_NUM_XLAYERS]; // indices into xlayer_cfg.xlayers[] + // (-1 if from decoder query) + avm_image_t *canvas; + FILE *outfile_cg; + int layers_placed; // reset each frame + int frame_count; + int mixed_chroma; // 1 if layers have different chroma formats + unsigned int max_bit_depth; // highest bit depth among layers in group + char label[128]; // human-readable label for stderr +} CompositeGroup; + +static const char *comp_layer_type_names[] = { "texture", "auxiliary", "stereo", + "dependent" }; +static const char *comp_aux_type_names[] = { "alpha", "depth", "segmentation", + "gain_map" }; +static const char *comp_view_type_names[] = { "unspecified", "center", "left", + "right", "explicit" }; + +// Build composite groups from arrays of per-xlayer properties. +// Allocates comp_groups and fills *out_groups / *out_num_groups. +// xlayer_ids[], layer_types[], aux_types[], view_types[] are parallel arrays +// of length num_xlayers. config_indices[] provides the JSON config index for +// each xlayer (-1 if built from decoder query). 
+static void build_composite_groups(int num_xlayers, const int *xlayer_ids, + const int *layer_types, const int *aux_types, + const int *view_types, + const int *config_indices, + CompositeGroup **out_groups, + int *out_num_groups) { + CompositeGroup *groups = + (CompositeGroup *)calloc(num_xlayers, sizeof(CompositeGroup)); + int num_groups = 0; + + for (int i = 0; i < num_xlayers; i++) { + int lt = layer_types[i]; + int at = aux_types[i]; + int vt = view_types[i]; + // Find existing group or create new + int gidx = -1; + for (int g = 0; g < num_groups; g++) { + if (groups[g].layer_type == lt && groups[g].auxiliary_type == at && + groups[g].view_type == vt) { + gidx = g; + break; + } + } + if (gidx < 0) { + gidx = num_groups++; + groups[gidx].layer_type = lt; + groups[gidx].auxiliary_type = at; + groups[gidx].view_type = vt; + groups[gidx].num_xlayers = 0; + groups[gidx].canvas = NULL; + groups[gidx].outfile_cg = NULL; + groups[gidx].layers_placed = 0; + groups[gidx].frame_count = 0; + } + int k = groups[gidx].num_xlayers++; + groups[gidx].xlayer_ids[k] = xlayer_ids[i]; + groups[gidx].xlayer_indices[k] = config_indices ? config_indices[i] : -1; + } + + // Build labels and report + fprintf(stderr, "Atlas composite: %d output group(s)\n", num_groups); + for (int g = 0; g < num_groups; g++) { + CompositeGroup *cg = &groups[g]; + const char *lt_name = (cg->layer_type >= 0 && cg->layer_type < 4) + ? comp_layer_type_names[cg->layer_type] + : "unknown"; + const char *vt_name = (cg->view_type >= 0 && cg->view_type < 5) + ? 
comp_view_type_names[cg->view_type] + : "unknown"; + if (cg->layer_type == AUX_LAYER && cg->auxiliary_type >= 0 && + cg->auxiliary_type < 4) { + snprintf(cg->label, sizeof(cg->label), "%s_%s_%s", + comp_aux_type_names[cg->auxiliary_type], lt_name, vt_name); + } else { + snprintf(cg->label, sizeof(cg->label), "%s_%s", lt_name, vt_name); + } + fprintf(stderr, " group %d [%s]: %d xlayer(s)\n", g, cg->label, + cg->num_xlayers); + } + + *out_groups = groups; + *out_num_groups = num_groups; +} + static int main_loop(int argc, const char **argv_) { avm_codec_ctx_t decoder; char *fn = NULL; @@ -679,6 +839,15 @@ static int main_loop(int argc, const char **argv_) { int num_local_ops_selections = 0; int output_all_layers = 0; int skip_film_grain = 0; + int atlas_composite = 0; + char xlayer_config_path[PATH_MAX] = { 0 }; + MultiXLayerConfig xlayer_cfg; + + CompositeGroup *comp_groups = NULL; + int num_comp_groups = 0; + int comp_groups_built = 0; + avm_atlas_info_t dec_atlas_info; + memset(&dec_atlas_info, 0, sizeof(dec_atlas_info)); int random_access_point_index = 0; int bru_opt_mode = 0; avm_image_t *scaled_img = NULL; @@ -686,6 +855,12 @@ static int main_loop(int argc, const char **argv_) { int frame_avail, got_data, flush_decoder = 0; int num_external_frame_buffers = 0; struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; + int is_monotonic_output = -1; // -1 = unknown, 0/1 from bitstream + + // Flush reordering buffer for interleaved single-file output + FlushFrame *flush_buf = NULL; + int flush_buf_count = 0; + int flush_buf_capacity = 0; const char *outfile_pattern = NULL; char outfile_name[PATH_MAX] = { 0 }; @@ -709,6 +884,7 @@ static int main_loop(int argc, const char **argv_) { FILE *outfile_substream[AVM_MAX_NUM_STREAMS] = { NULL }; int substream_frame_out[AVM_MAX_NUM_STREAMS] = { 0 }; + int total_decode_errors = 0; FILE *framestats_file = NULL; FILE *icc_f = NULL; @@ -876,6 +1052,10 @@ static int main_loop(int argc, const char **argv_) { bru_opt_mode = 1; } 
else if (arg_match(&arg, &icc_file, argi)) { icc_f = fopen(arg.val, "wb"); + } else if (arg_match(&arg, &xlayercfgarg, argi)) { + snprintf(xlayer_config_path, PATH_MAX, "%s", arg.val); + } else if (arg_match(&arg, &atlascompositearg, argi)) { + atlas_composite = 1; } else { argj++; } @@ -889,6 +1069,38 @@ static int main_loop(int argc, const char **argv_) { /* Handle non-option arguments */ fn = argv[0]; + // Atlas composite setup + xlayer_config_init(&xlayer_cfg); + if (atlas_composite) { + output_all_layers = 1; // implicitly enable all-layers output + } + // Default to keep-going mode for multi-xlayer decoding + if (output_all_layers && !keep_going) { + keep_going = 1; + } + if (xlayer_config_path[0] != '\0') { + if (parse_multi_xlayer_config(xlayer_config_path, &xlayer_cfg) != 0) { + die("Error: failed to parse xlayer config \"%s\"\n", xlayer_config_path); + } + // Build composite groups eagerly from JSON config + if (atlas_composite && xlayer_cfg.enable_atlas) { + int xlids[MAX_NUM_XLAYERS], lts[MAX_NUM_XLAYERS]; + int ats[MAX_NUM_XLAYERS], vts[MAX_NUM_XLAYERS]; + int idxs[MAX_NUM_XLAYERS]; + for (int xi = 0; xi < xlayer_cfg.num_xlayers; xi++) { + const XLayerEncConfig *xl = &xlayer_cfg.xlayers[xi]; + xlids[xi] = xl->xlayer_id; + lts[xi] = xl->layer_type; + ats[xi] = (xl->layer_type == AUX_LAYER) ? 
xl->auxiliary_type : -1; + vts[xi] = xl->view_type; + idxs[xi] = xi; + } + build_composite_groups(xlayer_cfg.num_xlayers, xlids, lts, ats, vts, idxs, + &comp_groups, &num_comp_groups); + comp_groups_built = 1; + } + } + if (!fn) { free(argv); fprintf(stderr, "No input file specified!\n"); @@ -955,6 +1167,28 @@ static int main_loop(int argc, const char **argv_) { outfile = open_outfile(outfile_name); } } + // Open per-group output files for atlas composite (JSON path only; + // decoder-query path opens files in the deferred block) + if (atlas_composite && comp_groups_built && num_comp_groups > 1) { + for (int g = 0; g < num_comp_groups; g++) { + char group_name[PATH_MAX + 128] = { 0 }; + // Insert group label before extension + const char *dot = strrchr(outfile_name, '.'); + if (dot) { + size_t base_len = (size_t)(dot - outfile_name); + snprintf(group_name, sizeof(group_name), "%.*s_%s%s", (int)base_len, + outfile_name, comp_groups[g].label, dot); + } else { + snprintf(group_name, sizeof(group_name), "%s_%s", outfile_name, + comp_groups[g].label); + } + comp_groups[g].outfile_cg = open_outfile(group_name); + fprintf(stderr, " group %d output: %s\n", g, group_name); + } + } else if (atlas_composite && comp_groups_built && num_comp_groups == 1) { + // Single group: reuse the main outfile + comp_groups[0].outfile_cg = outfile; + } } if (use_y4m && !noblit) { @@ -1102,6 +1336,7 @@ static int main_loop(int argc, const char **argv_) { avm_codec_error(&decoder)); if (detail) warn("Additional information: %s", detail); + total_decode_errors++; if (!keep_going) goto fail; } @@ -1136,6 +1371,68 @@ static int main_loop(int argc, const char **argv_) { dx_time += avm_usec_timer_elapsed(&timer); got_data = 0; + + // Deferred composite group building from decoder LCR/Atlas info + if (atlas_composite && !comp_groups_built) { + avm_lcr_info_t lcr_info; + memset(&lcr_info, 0, sizeof(lcr_info)); + memset(&dec_atlas_info, 0, sizeof(dec_atlas_info)); + + int have_lcr = 
!AVM_CODEC_CONTROL_TYPECHECKED(&decoder, AV2D_GET_LCR_INFO, + &lcr_info); + int have_atlas = !AVM_CODEC_CONTROL_TYPECHECKED( + &decoder, AV2D_GET_ATLAS_INFO, &dec_atlas_info); + + if (have_lcr && lcr_info.num_xlayers > 0) { + int xlids[31], lts[31], ats_arr[31], vts[31]; + for (int li = 0; li < lcr_info.num_xlayers; li++) { + xlids[li] = lcr_info.xlayers[li].xlayer_id; + lts[li] = lcr_info.xlayers[li].layer_type; + ats_arr[li] = lcr_info.xlayers[li].auxiliary_type; + vts[li] = lcr_info.xlayers[li].view_type; + } + build_composite_groups(lcr_info.num_xlayers, xlids, lts, ats_arr, vts, + NULL, &comp_groups, &num_comp_groups); + comp_groups_built = 1; + + if (have_atlas && dec_atlas_info.num_segments > 0) { + fprintf(stderr, + "Atlas info from bitstream: %dx%d canvas, %d segment(s)\n", + dec_atlas_info.atlas_width, dec_atlas_info.atlas_height, + dec_atlas_info.num_segments); + } + + // Open per-group output files + if (!noblit && single_file && outfile_pattern) { + if (num_comp_groups > 1) { + for (int g = 0; g < num_comp_groups; g++) { + char group_name[PATH_MAX + 128] = { 0 }; + const char *dot = strrchr(outfile_name, '.'); + if (dot) { + size_t base_len = (size_t)(dot - outfile_name); + snprintf(group_name, sizeof(group_name), "%.*s_%s%s", + (int)base_len, outfile_name, comp_groups[g].label, + dot); + } else { + snprintf(group_name, sizeof(group_name), "%s_%s", outfile_name, + comp_groups[g].label); + } + comp_groups[g].outfile_cg = open_outfile(group_name); + fprintf(stderr, " group %d output: %s\n", g, group_name); + } + } else if (num_comp_groups == 1) { + comp_groups[0].outfile_cg = outfile; + } + } + } else { + // No LCR info available — atlas composite not possible + fprintf(stderr, + "Warning: no LCR info in bitstream, atlas composite disabled. " + "Falling back to per-layer output.\n"); + atlas_composite = 0; + } + } + while ((img = avm_codec_get_frame(&decoder, &iter))) { // frame_out does not include hidden frames. 
++frame_out; @@ -1144,6 +1441,17 @@ static int main_loop(int argc, const char **argv_) { } if (!flush_decoder) got_data = 1; + // Query monotonic_output_order_flag lazily on first output frame + if (is_monotonic_output < 0) { + unsigned int mono_flag = 0; + if (!AVM_CODEC_CONTROL_TYPECHECKED( + &decoder, AV2D_GET_MONOTONIC_OUTPUT_ORDER, &mono_flag)) { + is_monotonic_output = (int)mono_flag; + } else { + is_monotonic_output = 1; // assume monotonic if unknown + } + } + if (AVM_CODEC_CONTROL_TYPECHECKED(&decoder, AVMD_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed AVM_GET_FRAME_CORRUPTED: %s", avm_codec_error(&decoder)); @@ -1181,6 +1489,235 @@ static int main_loop(int argc, const char **argv_) { const int PLANES_YVU[] = { AVM_PLANE_Y, AVM_PLANE_V, AVM_PLANE_U }; const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; + // Buffer frames for interleaved single-file output so they can + // be sorted by display order before writing. Non-monotonic + // output from the decoder can interleave xlayers out of display + // order even during normal decode (not just flush). + if (!is_monotonic_output && output_all_layers && num_streams == 1 && + single_file && !do_md5 && !atlas_composite) { + if (flush_buf_count >= flush_buf_capacity) { + int new_cap = flush_buf_capacity ? 
flush_buf_capacity * 2 : 64; + FlushFrame *new_buf = (FlushFrame *)realloc( + flush_buf, (size_t)new_cap * sizeof(FlushFrame)); + if (!new_buf) { + warn("Failed to allocate flush reorder buffer"); + goto fail; + } + flush_buf = new_buf; + flush_buf_capacity = new_cap; + } + FlushFrame *ff = &flush_buf[flush_buf_count]; + ff->img = deep_copy_image(img); + if (!ff->img) { + warn("Failed to copy flush frame"); + goto fail; + } + ff->order_hint = img->display_order_hint; + ff->xlayer_id = img->xlayer_id; + ff->mlayer_id = img->mlayer_id; + flush_buf_count++; + continue; + } + + // Atlas composite mode: place decoded xlayer into its group's canvas + if (atlas_composite && comp_groups_built) { + int xlid = img->xlayer_id; + + // Find this xlayer's composite group by xlayer_id + int gidx = -1; + for (int g = 0; g < num_comp_groups; g++) { + for (int k = 0; k < comp_groups[g].num_xlayers; k++) { + if (comp_groups[g].xlayer_ids[k] == xlid) { + gidx = g; + break; + } + } + if (gidx >= 0) break; + } + if (gidx < 0) { + fprintf(stderr, + "Warning: decoded xlayer_id %d not in any composite group, " + "skipping\n", + xlid); + continue; + } + + CompositeGroup *cg = &comp_groups[gidx]; + + // Allocate this group's canvas on first use + if (!cg->canvas) { + unsigned int cw = img->d_w; + unsigned int ch = img->d_h; + // Prefer atlas info from decoder, then JSON config + if (dec_atlas_info.atlas_width > 0 && + dec_atlas_info.atlas_height > 0) { + cw = (unsigned int)dec_atlas_info.atlas_width; + ch = (unsigned int)dec_atlas_info.atlas_height; + } else if (xlayer_cfg.atlas_width > 0 && + xlayer_cfg.atlas_height > 0) { + cw = (unsigned int)xlayer_cfg.atlas_width; + ch = (unsigned int)xlayer_cfg.atlas_height; + } + cg->max_bit_depth = img->bit_depth; + cg->canvas = avm_img_alloc(NULL, img->fmt, cw, ch, 32); + if (!cg->canvas) { + die("Error: failed to allocate composite canvas %ux%u for " + "group %d [%s]\n", + cw, ch, gidx, cg->label); + } + cg->canvas->bit_depth = img->bit_depth; + 
cg->canvas->monochrome = img->monochrome; + cg->canvas->csp = img->csp; + cg->canvas->range = img->range; + for (int p = 0; p < 3; p++) { + unsigned int ph = avm_img_plane_height(cg->canvas, p); + memset(cg->canvas->planes[p], 0, + (size_t)cg->canvas->stride[p] * ph); + } + } + + // Check chroma format compatibility + if (img->x_chroma_shift != cg->canvas->x_chroma_shift || + img->y_chroma_shift != cg->canvas->y_chroma_shift) { + if (!cg->mixed_chroma) { + cg->mixed_chroma = 1; + fprintf(stderr, + "Warning: group %d [%s] has mixed chroma formats — " + "compositing disabled for this group. Use per-layer " + "output (--all-layers --num-streams) instead.\n", + gidx, cg->label); + } + // Fall through to normal output path (don't continue) + } else { + // Handle bit-depth mismatch: promote to highest + unsigned int canvas_bd = cg->canvas->bit_depth; + unsigned int frame_bd = img->bit_depth; + if (frame_bd > cg->max_bit_depth) cg->max_bit_depth = frame_bd; + + // Get atlas position for this xlayer. + // Try decoder atlas info first, then JSON config fallback. + int pos_x = 0, pos_y = 0; + int found_pos = 0; + if (dec_atlas_info.num_segments > 0) { + for (int s = 0; s < dec_atlas_info.num_segments; s++) { + if (dec_atlas_info.segments[s].xlayer_id == xlid) { + pos_x = dec_atlas_info.segments[s].pos_x; + pos_y = dec_atlas_info.segments[s].pos_y; + found_pos = 1; + break; + } + } + } + if (!found_pos && xlayer_config_path[0] != '\0') { + for (int xi = 0; xi < xlayer_cfg.num_xlayers; xi++) { + if (xlayer_cfg.xlayers[xi].xlayer_id == xlid) { + pos_x = xlayer_cfg.xlayers[xi].atlas_pos_x >= 0 + ? xlayer_cfg.xlayers[xi].atlas_pos_x + : 0; + pos_y = xlayer_cfg.xlayers[xi].atlas_pos_y >= 0 + ? xlayer_cfg.xlayers[xi].atlas_pos_y + : 0; + break; + } + } + } + int canvas_bps = + (cg->canvas->fmt & AVM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + int frame_bps = (img->fmt & AVM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + int shift = (int)canvas_bd - (int)frame_bd; + + for (int p = 0; p < 3; p++) { + int px = pos_x, py = pos_y; + unsigned int pw = img->d_w, ph = img->d_h; + if (p > 0) { + px >>= (int)img->x_chroma_shift; + py >>= (int)img->y_chroma_shift; + pw >>= img->x_chroma_shift; + ph >>= img->y_chroma_shift; + } + const unsigned char *src_row = img->planes[p]; + unsigned char *dst_row = cg->canvas->planes[p] + + py * cg->canvas->stride[p] + + px * canvas_bps; + + if (shift == 0 && canvas_bps == frame_bps) { + // Same bit depth: direct memcpy + unsigned int row_bytes = pw * (unsigned int)canvas_bps; + for (unsigned int row = 0; row < ph; row++) { + memcpy(dst_row, src_row, row_bytes); + src_row += img->stride[p]; + dst_row += cg->canvas->stride[p]; + } + } else if (canvas_bps == 2 && frame_bps == 2 && shift > 0) { + // Both 16-bit, canvas higher: shift up + for (unsigned int row = 0; row < ph; row++) { + const uint16_t *s = (const uint16_t *)src_row; + uint16_t *d = (uint16_t *)dst_row; + for (unsigned int col = 0; col < pw; col++) + d[col] = (uint16_t)(s[col] << shift); + src_row += img->stride[p]; + dst_row += cg->canvas->stride[p]; + } + } else if (canvas_bps == 2 && frame_bps == 1) { + // 8-bit frame into 16-bit canvas + int total_shift = (int)canvas_bd - 8; + for (unsigned int row = 0; row < ph; row++) { + uint16_t *d = (uint16_t *)dst_row; + for (unsigned int col = 0; col < pw; col++) + d[col] = + (uint16_t)((unsigned int)src_row[col] << total_shift); + src_row += img->stride[p]; + dst_row += cg->canvas->stride[p]; + } + } else { + // Fallback: direct copy (same bps, shift <= 0 = truncate) + unsigned int row_bytes = pw * (unsigned int)frame_bps; + if ((unsigned int)canvas_bps < (unsigned int)frame_bps) + row_bytes = pw * (unsigned int)canvas_bps; + for (unsigned int row = 0; row < ph; row++) { + memcpy(dst_row, src_row, row_bytes); + src_row += img->stride[p]; + dst_row += cg->canvas->stride[p]; + } + } + } + + // Output composite when all xlayers for this group 
are placed + cg->layers_placed++; + if (cg->layers_placed >= cg->num_xlayers) { + cg->layers_placed = 0; + cg->frame_count++; + FILE *cg_out = cg->outfile_cg; + if (cg_out && single_file) { + avm_image_t *cimg = cg->canvas; + int num_planes_out = (opt_raw && cimg->monochrome) ? 1 : 3; + if (use_y4m) { + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; + if (cg->frame_count == 1) { + y4m_write_file_header( + y4m_buf, sizeof(y4m_buf), cimg->d_w, cimg->d_h, + &avm_input_ctx.framerate, cimg->monochrome, cimg->csp, + cimg->fmt, cimg->bit_depth, cimg->range); + fputs(y4m_buf, cg_out); + } + y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); + fputs(y4m_buf, cg_out); + y4m_write_image_file(cimg, planes, cg_out); + } else { + raw_write_image_file(cimg, planes, num_planes_out, cg_out); + } + } + // Zero-fill canvas for next frame + for (int p = 0; p < 3; p++) { + unsigned int ph = avm_img_plane_height(cg->canvas, p); + memset(cg->canvas->planes[p], 0, + (size_t)cg->canvas->stride[p] * ph); + } + } + continue; // skip normal output path + } + } + if (do_scale) { if (frame_out == 1) { // If the output frames are to be scaled to a fixed display size @@ -1375,11 +1912,80 @@ static int main_loop(int argc, const char **argv_) { } } + // Write buffered frames in display order for interleaved output + if (flush_buf_count > 0) { + qsort(flush_buf, (size_t)flush_buf_count, sizeof(FlushFrame), + compare_flush_frames); + const int PLANES_YUV[] = { AVM_PLANE_Y, AVM_PLANE_U, AVM_PLANE_V }; + const int PLANES_YVU[] = { AVM_PLANE_Y, AVM_PLANE_V, AVM_PLANE_U }; + const int *planes = flipuv ? 
PLANES_YVU : PLANES_YUV; + for (int fi = 0; fi < flush_buf_count; fi++) { + avm_image_t *fimg = flush_buf[fi].img; + unsigned int output_bit_depth; + if (!fixed_output_bit_depth && single_file) { + output_bit_depth = fimg->bit_depth; + } else { + output_bit_depth = fixed_output_bit_depth; + } + if (output_bit_depth != 0) + avm_shift_img(output_bit_depth, &fimg, &img_shifted); + + if (use_y4m) { + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; + if (fi == 0) { + // Write y4m file header for the first sorted frame + y4m_write_file_header(y4m_buf, sizeof(y4m_buf), fimg->d_w, fimg->d_h, + &avm_input_ctx.framerate, fimg->monochrome, + fimg->csp, fimg->fmt, fimg->bit_depth, + fimg->range); + fputs(y4m_buf, outfile); + } + y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); + fputs(y4m_buf, outfile); + y4m_write_image_file(fimg, planes, outfile); + } else { + int num_planes = (opt_raw && fimg->monochrome) ? 1 : 3; + raw_write_image_file(fimg, planes, num_planes, outfile); + } + avm_img_free(flush_buf[fi].img); + } + // frame_out was already incremented in the main loop for each + // buffered frame, so don't add flush_buf_count again. 
+ free(flush_buf); + flush_buf = NULL; + flush_buf_count = 0; + } + if (summary || progress) { show_progress(frame_in, frame_out, dx_time); fprintf(stderr, "\n"); } + // Output summary report + if (!noblit && outfile_pattern && strcmp(outfile_pattern, "-") != 0) { + fprintf(stderr, "\nDecode complete:\n"); + if (atlas_composite && comp_groups_built) { + for (int g = 0; g < num_comp_groups; g++) { + fprintf(stderr, " Output: %s (%d frames)\n", comp_groups[g].label, + comp_groups[g].frame_count); + } + } else if (num_streams > 1) { + for (int sub = 0; sub < num_streams; sub++) { + char outfile_substream_name[PATH_MAX] = { 0 }; + add_postfix_stream_id(outfile_name, outfile_substream_name, sub); + fprintf(stderr, " Output: %s (%d frames)\n", outfile_substream_name, + substream_frame_out[sub]); + } + } else { + fprintf(stderr, " Output: %s (%d frames)\n", outfile_name, frame_out); + } + if (total_decode_errors > 0) { + fprintf(stderr, " Errors: %d\n", total_decode_errors); + } else { + fprintf(stderr, " Errors: 0\n"); + } + } + if (frames_corrupted) { fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); } else { @@ -1388,6 +1994,14 @@ static int main_loop(int argc, const char **argv_) { fail: + // Clean up flush buffer if we exited early + if (flush_buf) { + for (int fi = 0; fi < flush_buf_count; fi++) { + if (flush_buf[fi].img) avm_img_free(flush_buf[fi].img); + } + free(flush_buf); + } + if (avm_codec_destroy(&decoder)) { fprintf(stderr, "Failed to destroy decoder: %s\n", avm_codec_error(&decoder)); @@ -1433,6 +2047,15 @@ static int main_loop(int argc, const char **argv_) { if (scaled_img) avm_img_free(scaled_img); if (img_shifted) avm_img_free(img_shifted); + if (comp_groups) { + for (int g = 0; g < num_comp_groups; g++) { + if (comp_groups[g].canvas) avm_img_free(comp_groups[g].canvas); + // Close per-group files (but not if it's the shared main outfile) + if (comp_groups[g].outfile_cg && comp_groups[g].outfile_cg != outfile) + 
fclose(comp_groups[g].outfile_cg); + } + free(comp_groups); + } for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { free(ext_fb_list.ext_fb[i].data); diff --git a/apps/avmenc.c b/apps/avmenc.c index 6375df5d8a..257dba557a 100644 --- a/apps/avmenc.c +++ b/apps/avmenc.c @@ -40,6 +40,8 @@ #include "common/tools_common.h" #include "common/warnings.h" #include "av2/common/blockd.h" +#include "common/xlayer_config_parse.h" +#include "apps/avmenc_xlayer.h" #if CONFIG_WEBM_IO #include "common/webmenc.h" @@ -136,6 +138,7 @@ const arg_def_t *main_args[] = { &g_av2_codec_arg_defs.help, &g_av2_codec_arg_defs.debugmode, &g_av2_codec_arg_defs.outputfile, &g_av2_codec_arg_defs.reconfile, + &g_av2_codec_arg_defs.xlayer_config, &g_av2_codec_arg_defs.codecarg, &g_av2_codec_arg_defs.passes, &g_av2_codec_arg_defs.pass_arg, @@ -737,6 +740,8 @@ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { global->disable_warning_prompt = 1; } else if (arg_match(&arg, &g_av2_codec_arg_defs.icc_file, argi)) { read_icc_profile(global, arg.val); + } else if (arg_match(&arg, &g_av2_codec_arg_defs.xlayer_config, argi)) { + global->xlayer_config_path = arg.val; } else { argj++; } @@ -1624,12 +1629,7 @@ static void setup_pass(struct stream_state *stream, static void initialize_encoder(struct stream_state *stream, struct AvxEncoderConfig *global) { int i; - int flags = 0; - - flags |= (global->show_psnr >= 1) ? AVM_CODEC_USE_PSNR : 0; - flags |= (global->show_psnr == 2) ? AVM_CODEC_USE_STREAM_PSNR : 0; - flags |= global->quiet ? 0 : AVM_CODEC_USE_PER_FRAME_STATS; - flags |= global->verbose ? 
AVM_CODEC_USE_PER_FRAME_HLS_INFO : 0; + int flags = avx_encoder_init_flags(global); /* Construct Encoder Context */ avm_codec_enc_init(&stream->encoder, global->codec, &stream->config.cfg, @@ -1907,40 +1907,46 @@ static float usec_to_fps(uint64_t usec, unsigned int frames) { } static void write_recon_file(struct stream_state *stream, FILE *file) { - avm_image_t enc_img; - - AVM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV2_GET_NEW_FRAME_IMAGE, - &enc_img); + const avm_image_t *enc_img = avm_codec_get_preview_frame(&stream->encoder); - ctx_exit_on_error(&stream->encoder, - "Failed to get encoder reconstructed frame"); + if (!enc_img) { + ctx_exit_on_error(&stream->encoder, + "Failed to get encoder reconstructed frame"); + return; + } - int num_planes = enc_img.monochrome ? 1 : 3; + int num_planes = enc_img->monochrome ? 1 : 3; const int PLANES_YUV[] = { AVM_PLANE_Y, AVM_PLANE_U, AVM_PLANE_V }; const int *planes = PLANES_YUV; - raw_write_image_file(&enc_img, planes, num_planes, file); + raw_write_image_file(enc_img, planes, num_planes, file); } static void test_decode(struct stream_state *stream, enum TestDecodeFatality fatal) { - avm_image_t enc_img, dec_img; + avm_image_t dec_img; - // fprintf(stderr, "DEBUG: Running test_decode at POC: %d\n", - // stream->frames_out - 1); if (stream->mismatch_seen) return; - /* Get the internal reference frame */ - AVM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV2_GET_NEW_FRAME_IMAGE, - &enc_img); + /* Get the internal reference frame from the encoder via preview API. + * AV2_GET_NEW_FRAME_IMAGE relies on last_show_frame_buf which is only set + * for immediate-output frames, so it fails for hidden frames encoded with + * SEF mode. The preview API accesses cm->cur_frame directly and always + * works. 
*/ + const avm_image_t *enc_img = avm_codec_get_preview_frame(&stream->encoder); + AVM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV2_GET_NEW_FRAME_IMAGE, &dec_img); - ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); + if (!enc_img) { + ctx_exit_on_error(&stream->encoder, + "Failed to get encoder reference frame"); + return; + } ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); - if (!avm_compare_img(&enc_img, &dec_img)) { + if (!avm_compare_img(enc_img, &dec_img)) { int y[4], u[4], v[4]; - avm_find_mismatch_high(&enc_img, &dec_img, y, u, v); + avm_find_mismatch_high(enc_img, &dec_img, y, u, v); stream->decoder.err = 1; warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, "Stream %d: Encode/decode mismatch on POC %d at" @@ -1953,7 +1959,6 @@ static void test_decode(struct stream_state *stream, stream->mismatch_seen = stream->frames_out; } - avm_img_free(&enc_img); avm_img_free(&dec_img); } @@ -1991,6 +1996,35 @@ int main(int argc, const char **argv_) { if (argc < 2) usage_exit(); + // Multi-xlayer encoding: dispatch to separate path if config is provided + if (global.xlayer_config_path != NULL) { + // Warn about unconsumed CLI args that will be ignored in xlayer mode + for (argi = argv; *argi; argi++) { + if (argi[0][0] == '-' && argi[0][1]) + warn( + "option \"%s\" ignored in xlayer mode " + "(use JSON config instead)", + *argi); + } + + MultiXLayerConfig mcfg; + if (parse_multi_xlayer_config(global.xlayer_config_path, &mcfg) != 0) { + die("Error: failed to parse xlayer config \"%s\"\n", + global.xlayer_config_path); + } + if (resolve_input_sources(&mcfg) != 0) { + die("Error: failed to resolve input sources in xlayer config \"%s\"\n", + global.xlayer_config_path); + } + resolve_mlayer_ci(&mcfg); + if (validate_multi_xlayer_config(&mcfg) != 0) { + die("Error: invalid xlayer config \"%s\"\n", global.xlayer_config_path); + } + res = encode_multi_xlayer(&mcfg, &global); + free(argv); + return 
res; + } + switch (global.color_type) { case I420: input.fmt = AVM_IMG_FMT_I420; break; case I422: input.fmt = AVM_IMG_FMT_I422; break; diff --git a/apps/avmenc.h b/apps/avmenc.h index 17c9f94613..1ca93f9ede 100644 --- a/apps/avmenc.h +++ b/apps/avmenc.h @@ -52,8 +52,20 @@ struct AvxEncoderConfig { int experimental_bitstream; avm_chroma_sample_position_t csp; cfg_options_t encoder_config; + const char *xlayer_config_path; // Path to multi-xlayer JSON config }; +// Compute encoder init flags from global config (used by both single-stream +// and multi-xlayer paths). +static inline int avx_encoder_init_flags(const struct AvxEncoderConfig *cfg) { + int flags = 0; + flags |= (cfg->show_psnr >= 1) ? AVM_CODEC_USE_PSNR : 0; + flags |= (cfg->show_psnr == 2) ? AVM_CODEC_USE_STREAM_PSNR : 0; + flags |= cfg->quiet ? 0 : AVM_CODEC_USE_PER_FRAME_STATS; + flags |= cfg->verbose ? AVM_CODEC_USE_PER_FRAME_HLS_INFO : 0; + return flags; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/apps/avmenc_xlayer.c b/apps/avmenc_xlayer.c new file mode 100644 index 0000000000..9c373e848e --- /dev/null +++ b/apps/avmenc_xlayer.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. 
+ */ + +#include "apps/avmenc_xlayer.h" + +#include +#include +#include +#include + +#include "avm/avm_encoder.h" +#include "avm/avm_integer.h" +#include "avm/avmcx.h" +#include "avm_ports/avm_timer.h" +#include "common/tools_common.h" +#include "common/y4minput.h" + +// Shared source reader for subpicture encoding from a single input +typedef struct SharedSourceReader { + struct AvxInputContext input; + avm_image_t raw; // full-resolution frame + int initialized; + int eof; +} SharedSourceReader; + +// Open a file and detect its type (Y4M or raw YUV). +// On success, populates input->file, file_type, and (for Y4M) dimensions, +// framerate, format, bit_depth, and color_range. Returns 0 on success, -1 +// on error. +static int open_and_detect_input(struct AvxInputContext *input, + const char *filename) { + input->file = fopen(filename, "rb"); + if (!input->file) { + fprintf(stderr, "Error: cannot open input file \"%s\"\n", filename); + return -1; + } + + struct FileTypeDetectionBuffer *detect = &input->detect; + detect->buf_read = (int)fread(detect->buf, 1, 4, input->file); + detect->position = 0; + + if (detect->buf_read >= 4 && memcmp(detect->buf, "YUV4", 4) == 0) { + input->file_type = FILE_TYPE_Y4M; + y4m_input_open(&input->y4m, input->file, (char *)detect->buf, 4, + AVM_CSP_UNSPECIFIED, 0); + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.avm_fmt; + input->bit_depth = input->y4m.bit_depth; + input->color_range = input->y4m.color_range; + } else { + input->file_type = FILE_TYPE_RAW; + fseek(input->file, 0, SEEK_SET); + // Reset detect buffer so read_yuv_frame doesn't replay detection bytes + detect->buf_read = 0; + detect->position = 0; + } + return 0; +} + +static int shared_source_init(SharedSourceReader *src, + const InputSourceConfig *inp, + const MultiXLayerConfig *mcfg) { + memset(src, 0, sizeof(*src)); + if 
(inp->filename[0] == '\0') return 0; + + src->input.filename = inp->filename; + src->input.framerate.numerator = 30; + src->input.framerate.denominator = 1; + src->input.only_i420 = 0; + src->input.bit_depth = 0; + + if (open_and_detect_input(&src->input, inp->filename) != 0) return -1; + + if (src->input.file_type == FILE_TYPE_RAW) { + // Use config-specified dimensions for raw input + src->input.width = inp->width; + src->input.height = inp->height; + src->input.fmt = AVM_IMG_FMT_I420; + } + + // Override dimensions from config if specified + if (inp->width > 0) src->input.width = inp->width; + if (inp->height > 0) src->input.height = inp->height; + + // Apply explicit format/bit_depth (overrides Y4M detection too) + if (inp->format == 422) + src->input.fmt = AVM_IMG_FMT_I422; + else if (inp->format == 444) + src->input.fmt = AVM_IMG_FMT_I444; + else if (inp->format == 420) + src->input.fmt = AVM_IMG_FMT_I420; + + if (inp->bit_depth > 0) src->input.bit_depth = inp->bit_depth; + + // If format still unknown, derive from the first xlayer using this source + if (src->input.fmt == 0) { + // Find first xlayer referencing this input source + int src_idx = (int)(inp - mcfg->input_sources); + for (int i = 0; i < mcfg->num_xlayers; i++) { + if (mcfg->xlayers[i].input_source_idx == src_idx) { + switch (mcfg->xlayers[i].profile) { + case MAIN_422_10_IP1: src->input.fmt = AVM_IMG_FMT_I422; break; + case MAIN_444_10_IP1: src->input.fmt = AVM_IMG_FMT_I444; break; + default: src->input.fmt = AVM_IMG_FMT_I420; break; + } + break; + } + } + if (src->input.fmt == 0) src->input.fmt = AVM_IMG_FMT_I420; + } + + // Allocate full-resolution raw frame + if (src->input.file_type != FILE_TYPE_Y4M) { + if (!avm_img_alloc(&src->raw, src->input.fmt, src->input.width, + src->input.height, 32)) { + fprintf(stderr, "Error: failed to allocate shared source image\n"); + return -1; + } + } + + src->initialized = 1; + fprintf(stderr, "Input source \"%s\": %ux%u, \"%s\"\n", inp->name, + 
src->input.width, src->input.height, inp->filename); + return 0; +} + +// Read one full-resolution frame from shared source. Returns 1 if available. +static int shared_source_read_frame(SharedSourceReader *src) { + if (!src->initialized || src->eof) return 0; + + int frame_avail; + if (src->input.file_type == FILE_TYPE_Y4M) { + frame_avail = (y4m_input_fetch_frame(&src->input.y4m, src->input.file, + &src->raw) >= 1); + } else { + frame_avail = (read_yuv_frame(&src->input, &src->raw) == 0); + } + + if (!frame_avail) { + src->eof = 1; + return 0; + } + return 1; +} + +// Crop a region from the shared source into an xlayer's raw buffer. +// Copies the rectangle at (pos_x, pos_y) with size (crop_w, crop_h) +// from src_img into dst_img. +static void crop_region_to_xlayer(avm_image_t *dst_img, + const avm_image_t *src_img, int pos_x, + int pos_y, unsigned int crop_w, + unsigned int crop_h) { + for (int plane = 0; plane < 3; plane++) { + int sx = pos_x; + int sy = pos_y; + unsigned int cw = crop_w; + unsigned int ch = crop_h; + int bytes_per_sample = 1; + + if (src_img->fmt & AVM_IMG_FMT_HIGHBITDEPTH) bytes_per_sample = 2; + + if (plane > 0) { + sx >>= (int)src_img->x_chroma_shift; + sy >>= (int)src_img->y_chroma_shift; + cw >>= src_img->x_chroma_shift; + ch >>= src_img->y_chroma_shift; + } + + const unsigned char *src_row = src_img->planes[plane] + + sy * src_img->stride[plane] + + sx * bytes_per_sample; + unsigned char *dst_row = dst_img->planes[plane]; + unsigned int row_bytes = cw * (unsigned int)bytes_per_sample; + + for (unsigned int y = 0; y < ch; y++) { + memcpy(dst_row, src_row, row_bytes); + src_row += src_img->stride[plane]; + dst_row += dst_img->stride[plane]; + } + } +} + +static void shared_source_destroy(SharedSourceReader *src) { + if (!src->initialized) return; + if (src->input.file) fclose(src->input.file); + avm_img_free(&src->raw); + src->initialized = 0; +} + +// Forward declaration — defined after get_frame_to_encode +static int 
mlayer_crop_differs(const XLayerEncConfig *xlcfg, int ml);

// Initialize a single xlayer encoder from its config entry.
// Uses the global config for defaults that aren't overridden per-layer.
// When use_shared_source is true, input file opening is skipped (source is
// provided externally via crop_region_to_xlayer).
//
// Parameters:
//   state             - per-xlayer encoder state to fill in.
//   xlcfg             - this xlayer's configuration entry.
//   mcfg              - whole multi-xlayer configuration (input sources, LCR
//                       and OPS switches, monotonic output flag).
//   global            - command-line level settings (frame rate, reporting
//                       flags).
//   use_shared_source - non-zero when frames are supplied by a shared source.
//
// The work is done in this order: resolve input geometry/format, fill the
// pre-init encoder config, create the encoder, apply post-init codec
// controls, then allocate the raw frame buffers.
//
// Returns 0 on success and -1 on failure. Nothing acquired before a failure
// is released here.
// NOTE(review): presumably the caller's cleanup path destroys every state
// regardless of how far initialization got — confirm.
static int init_xlayer_encoder(XLayerEncoderState *state,
                               const XLayerEncConfig *xlcfg,
                               const MultiXLayerConfig *mcfg,
                               const struct AvxEncoderConfig *global,
                               int use_shared_source) {
  avm_codec_iface_t *iface = get_avm_encoder_by_short_name("av2");
  if (!iface) {
    fprintf(stderr, "Error: AV2 encoder not available\n");
    return -1;
  }

  state->xlayer_id = xlcfg->xlayer_id;
  state->frames_out = 0;
  state->frame_count = 0;
  state->cx_time = 0;
  state->eof = 0;
  state->allocated_raw_shift = 0;
  state->input_shift = 0;

  // Open input file (skip when using shared source — frames come from crop)
  if (!use_shared_source) {
    memset(&state->input, 0, sizeof(state->input));
    state->input.filename = xlcfg->input_filename;
    state->input.framerate.numerator = 30;
    state->input.framerate.denominator = 1;
    state->input.only_i420 = 0;
    state->input.bit_depth = 0;

    if (open_and_detect_input(&state->input, xlcfg->input_filename) != 0) {
      fprintf(stderr, "Error: failed to open input for xlayer %d\n",
              xlcfg->xlayer_id);
      return -1;
    }

    // Override dimensions from config if specified
    if (xlcfg->width > 0) state->input.width = xlcfg->width;
    if (xlcfg->height > 0) state->input.height = xlcfg->height;
    if (state->input.fmt == 0)
      state->input.fmt = AVM_IMG_FMT_I420;  // default, profile may override
  } else {
    // Shared source mode: dimensions come from xlayer config
    // (input.bit_depth stays 0 here, so the input is treated as 8-bit below)
    memset(&state->input, 0, sizeof(state->input));
    state->input.width = xlcfg->width;
    state->input.height = xlcfg->height;
    state->input.fmt = AVM_IMG_FMT_I420;  // default, profile may override
    state->input.framerate.numerator = 30;
    state->input.framerate.denominator = 1;
  }

  // Derive input image format from profile (chroma subsampling)
  switch (xlcfg->profile) {
    case MAIN_422_10_IP1: state->input.fmt = AVM_IMG_FMT_I422; break;
    case MAIN_444_10_IP1: state->input.fmt = AVM_IMG_FMT_I444; break;
    default:  // MAIN_420_10_IP0..MAIN_420_10: 4:2:0
      // Keep whatever was detected from file, or default I420
      // (any other detected value, including one carrying extra format
      // flags, is replaced by plain I420)
      if (state->input.fmt != AVM_IMG_FMT_I422 &&
          state->input.fmt != AVM_IMG_FMT_I444)
        state->input.fmt = AVM_IMG_FMT_I420;
      break;
  }

  // Get default encoder config
  avm_codec_err_t res = avm_codec_enc_config_default(iface, &state->cfg, 0);
  if (res) {
    fprintf(stderr, "Error: failed to get default config for xlayer %d\n",
            xlcfg->xlayer_id);
    return -1;
  }

  // Set dimensions
  state->cfg.g_w = state->input.width;
  state->cfg.g_h = state->input.height;

  // Set timebase from input framerate (or global).
  // When using a named input source with an explicit frame rate, use that
  // rate so the encoder's internal timing matches the source content rate.
  // Priority: named source rate, then CLI frame rate, then detected/default.
  if (xlcfg->input_source_idx >= 0 &&
      mcfg->input_sources[xlcfg->input_source_idx].frame_rate_num > 0) {
    state->cfg.g_timebase.num =
        mcfg->input_sources[xlcfg->input_source_idx].frame_rate_den;
    state->cfg.g_timebase.den =
        mcfg->input_sources[xlcfg->input_source_idx].frame_rate_num;
  } else if (global->have_framerate) {
    state->cfg.g_timebase.num = global->framerate.den;
    state->cfg.g_timebase.den = global->framerate.num;
  } else {
    state->cfg.g_timebase.num = state->input.framerate.denominator;
    state->cfg.g_timebase.den = state->input.framerate.numerator;
  }

  // Set profile
  state->cfg.g_profile = xlcfg->profile;

  // Set rate control: use QP if specified, otherwise use global settings
  if (xlcfg->qp >= 0) {
    state->cfg.rc_end_usage = AVM_Q;
    // use_fixed_qp_offsets=1 tells the rate control to honor the specified QP
    // directly, bypassing adaptive KF quality boosting that would otherwise
    // ignore the QP and encode keyframes at minimum quantizer.
    state->cfg.use_fixed_qp_offsets = 1;
    state->cfg.rc_min_quantizer = 0;
    state->cfg.rc_max_quantizer = 255;
  } else if (xlcfg->bitrate > 0) {
    state->cfg.rc_end_usage = AVM_VBR;
    state->cfg.rc_target_bitrate = xlcfg->bitrate;
  }

  // Set lag_in_frames
  if (xlcfg->lag_in_frames >= 0) {
    state->cfg.g_lag_in_frames = xlcfg->lag_in_frames;
  }

  // Set keyframe interval.
  // For multi-mlayer xlayers with lag_in_frames == 0, disable encoder-internal
  // keyframe placement because the encoder's keyframe counter advances per
  // encode call (not per TU), causing misaligned keyframes across mlayers.
  // The xlayer encode loop manages keyframes externally via AVM_EFLAG_FORCE_KF.
  // For multi-mlayer with lag_in_frames > 0, use multi_layers_lag_test which
  // fixes the per-encode-call counter and enables forward keyframe support.
  if (xlcfg->num_embedded_layers > 1 && state->cfg.g_lag_in_frames == 0) {
    state->cfg.kf_mode = AVM_KF_DISABLED;
    // Set kf_max_dist to the spec conformance limit for display_order_hint:
    // get_disp_order_hint must return < (1 << (DISPLAY_ORDER_HINT_BITS - 1)).
    state->cfg.kf_max_dist = (1 << 29);
  } else if (xlcfg->kf_max_dist >= 0) {
    state->cfg.kf_max_dist = xlcfg->kf_max_dist;
  }

  // Enable LCR and OPS based on config
  state->cfg.enable_lcr =
      (mcfg->enable_global_lcr || mcfg->enable_local_lcr) ? 1 : 0;
  if (mcfg->num_ops_sets > 0) {
    state->cfg.enable_ops = 1;
  }

  // Set bit depth based on profile (all standard AV2 profiles are 10-bit)
  state->cfg.g_bit_depth = AVM_BITS_10;
#if CONFIG_TESTONLY_12BIT_SUPPORT
  if (xlcfg->profile == TEST_ONLY_12BIT_PROFILE)
    state->cfg.g_bit_depth = AVM_BITS_12;
#endif
  // An unknown input depth (0) is treated as 8-bit; input_shift is the
  // number of bits each sample is shifted up before encoding.
  state->cfg.g_input_bit_depth =
      state->input.bit_depth > 0 ? state->input.bit_depth : 8;
  state->input_shift =
      (int)state->cfg.g_bit_depth - (int)state->cfg.g_input_bit_depth;

  // Set fwd_kf_enabled from GOP mode (must be set before encoder init)
  {
    int fwd_kf = 0;
    switch (xlcfg->gop_mode) {
      case 1:  // open_leading
      case 2:  // open_sef
        fwd_kf = 1;
        break;
      default:  // 0 = closed
        fwd_kf = 0;
        break;
    }
    // An explicit per-xlayer setting wins over the GOP-mode default.
    if (xlcfg->fwd_kf_enabled >= 0) fwd_kf = xlcfg->fwd_kf_enabled;
    state->cfg.fwd_kf_enabled = fwd_kf;
  }

  // Set S-Frame pre-init config fields
  if (xlcfg->sframe_dist >= 0) {
    state->cfg.sframe_dist = (unsigned int)xlcfg->sframe_dist;
  }
  if (xlcfg->sframe_mode >= 0) {
    state->cfg.sframe_mode = (unsigned int)xlcfg->sframe_mode;
  }
  if (xlcfg->sframe_type >= 0) {
    state->cfg.sframe_type = (unsigned int)xlcfg->sframe_type;
  }

  // Initialize encoder with reporting flags matching single-stream path
  int flags = avx_encoder_init_flags(global);
  res = avm_codec_enc_init(&state->encoder, iface, &state->cfg, flags);
  if (res) {
    fprintf(stderr, "Error: encoder init failed for xlayer %d: %s\n",
            xlcfg->xlayer_id, avm_codec_error(&state->encoder));
    return -1;
  }

  // Apply encoder controls
  // (results of the avm_codec_control calls below are not checked)
  int cpu = xlcfg->cpu_used >= 0 ? xlcfg->cpu_used : 5;
  avm_codec_control(&state->encoder, AVME_SET_CPUUSED, cpu);
  avm_codec_control(&state->encoder, AVME_SET_XLAYER_ID, xlcfg->xlayer_id);

  // Set QP via codec control (not via rc_min/max_quantizer)
  if (xlcfg->qp >= 0) {
    avm_codec_control(&state->encoder, AVME_SET_QP, (unsigned int)xlcfg->qp);
  }

  // Apply GOP mode controls (post-init codec controls)
  // Note: kf_filt (keyframe filtering) is independent of GOP mode. The first
  // frame is always a displayed CLK. For open GOP modes, fwd_kf_enabled=1
  // (set pre-init above) causes subsequent keyframes to be OLK. In AV2,
  // OLK frames can be displayed directly — they do not need to be hidden.
  // kf_filt can be set separately via the "enable_keyframe_filtering" config.
  {
    int kf_filt = 0, sef_hidden = 0, intra_only_fwd = 0;
    switch (xlcfg->gop_mode) {
      case 1:  // open_leading: OLK at subsequent GOP boundaries
        sef_hidden = 0;
        break;
      case 2:  // open_sef (monotonic: hidden INTRA_ONLY_FRAME + SEF)
        sef_hidden = 1;
        if (mcfg->monotonic_output_order) intra_only_fwd = 1;
        break;
      default:  // 0 = closed
        sef_hidden = 0;
        break;
    }
    // Monotonic output requires SEF for all hidden frames — implicit output
    // is not allowed when monotonic_output_order_flag is set.
    if (mcfg->monotonic_output_order) sef_hidden = 1;

    // Explicit per-xlayer settings are applied last and therefore override
    // both the GOP-mode defaults and the monotonic-output rule above.
    if (xlcfg->enable_keyframe_filtering >= 0)
      kf_filt = xlcfg->enable_keyframe_filtering;
    if (xlcfg->add_sef_for_hidden_frames >= 0)
      sef_hidden = xlcfg->add_sef_for_hidden_frames;

    avm_codec_control(&state->encoder, AV2E_SET_ENABLE_KEYFRAME_FILTERING,
                      (unsigned int)kf_filt);
    avm_codec_control(&state->encoder, AV2E_SET_ADD_SEF_FOR_HIDDEN_FRAMES,
                      sef_hidden);
    if (intra_only_fwd) {
      avm_codec_control(&state->encoder, AV2E_SET_INTRA_ONLY_FWD_KF, 1);
    }
  }

  // Enable multi_layers_lag_test for multi-mlayer with lag > 0.
  // This fixes per-encode-call keyframe counting and GF group management.
  // The GF interval must be set to (lag - 1) / num_mlayers to account for
  // mlayer interleaving in the lookahead — each source frame generates
  // num_mlayers encode calls, so the effective lag in source frames is
  // lag / num_mlayers. Without this, the GF group is too large for the
  // lookahead and the encoder never produces output beyond the keyframe.
  if (xlcfg->num_embedded_layers > 1 && state->cfg.g_lag_in_frames > 0) {
    avm_codec_control(&state->encoder,
                      AV2E_SET_ENABLE_FLAG_MULTI_LAYER_LAG_TEST, 1);
    // NOTE(review): this is 0 whenever lag - 1 < num_embedded_layers —
    // confirm the encoder accepts a zero GF interval here.
    int gop_size =
        (state->cfg.g_lag_in_frames - 1) / xlcfg->num_embedded_layers;
    avm_codec_control(&state->encoder, AV2E_SET_MIN_GF_INTERVAL, gop_size);
    avm_codec_control(&state->encoder, AV2E_SET_MAX_GF_INTERVAL, gop_size);
  }

  if (xlcfg->num_embedded_layers > 1) {
    avm_codec_control(&state->encoder, AVME_SET_NUMBER_MLAYERS,
                      xlcfg->num_embedded_layers);
  }
  if (xlcfg->num_temporal_layers > 1) {
    avm_codec_control(&state->encoder, AVME_SET_NUMBER_TLAYERS,
                      xlcfg->num_temporal_layers);
  }

  if (mcfg->monotonic_output_order) {
    avm_codec_control(&state->encoder, AV2E_SET_MONOTONIC_OUTPUT_ORDER, 1);
  }

  // Propagate xlayer-level color configuration to the encoder.
  // These were previously parsed from JSON but never applied.
  // A negative value means "not specified" and leaves the encoder default.
  if (xlcfg->color_primaries >= 0)
    avm_codec_control(&state->encoder, AV2E_SET_COLOR_PRIMARIES,
                      (unsigned int)xlcfg->color_primaries);
  if (xlcfg->transfer_characteristics >= 0)
    avm_codec_control(&state->encoder, AV2E_SET_TRANSFER_CHARACTERISTICS,
                      (unsigned int)xlcfg->transfer_characteristics);
  if (xlcfg->matrix_coefficients >= 0)
    avm_codec_control(&state->encoder, AV2E_SET_MATRIX_COEFFICIENTS,
                      (unsigned int)xlcfg->matrix_coefficients);
  if (xlcfg->full_range_flag >= 0)
    avm_codec_control(&state->encoder, AV2E_SET_COLOR_RANGE,
                      (unsigned int)xlcfg->full_range_flag);

  // Apply per-mlayer CI overrides (after resolving inheritance from xlayer).
  // Only set controls for mlayers whose CI differs from the xlayer base.
  for (int m = 0; m < xlcfg->num_embedded_layers; m++) {
    const MLayerSourceConfig *ms = &xlcfg->mlayer_sources[m];
    if (ms->color_primaries >= 0 &&
        ms->color_primaries != xlcfg->color_primaries)
      avm_codec_control(&state->encoder, AV2E_SET_MLAYER_COLOR_PRIMARIES,
                        (unsigned int)m, (unsigned int)ms->color_primaries);
    if (ms->transfer_characteristics >= 0 &&
        ms->transfer_characteristics != xlcfg->transfer_characteristics)
      avm_codec_control(
          &state->encoder, AV2E_SET_MLAYER_TRANSFER_CHARACTERISTICS,
          (unsigned int)m, (unsigned int)ms->transfer_characteristics);
    if (ms->matrix_coefficients >= 0 &&
        ms->matrix_coefficients != xlcfg->matrix_coefficients)
      avm_codec_control(&state->encoder, AV2E_SET_MLAYER_MATRIX_COEFFICIENTS,
                        (unsigned int)m, (unsigned int)ms->matrix_coefficients);
    if (ms->full_range_flag >= 0 &&
        ms->full_range_flag != xlcfg->full_range_flag)
      avm_codec_control(&state->encoder, AV2E_SET_MLAYER_COLOR_RANGE,
                        (unsigned int)m, (unsigned int)ms->full_range_flag);
  }

  // Apply mlayer dependency controls
  if (xlcfg->has_mlayer_dependencies) {
    avm_codec_control(&state->encoder, AV2E_SET_MLAYER_DEPENDENCY_PRESENT,
                      (unsigned int)1);
    for (int m = 0; m < xlcfg->num_embedded_layers; m++) {
      unsigned int mask =
          (unsigned int)resolve_mlayer_dep_mask(&xlcfg->mlayer_sources[m], m);
      avm_codec_control(&state->encoder, AV2E_SET_MLAYER_DEPENDENCY_MAP,
                        (unsigned int)m, mask);
    }
  }

  // Apply per-xlayer sub-GOP config if specified
  if (xlcfg->subgop_config_path[0] != '\0') {
    avm_codec_control(&state->encoder, AV2E_SET_SUBGOP_CONFIG_PATH,
                      xlcfg->subgop_config_path);
  }

  // Apply generic codec controls from JSON "codec_controls" array.
  // Each control is a (name, value) pair mapped to an AV2E_SET_* control ID.
  // Names missing from the table are reported and skipped, not fatal.
  {
    static const struct {
      const char *name;
      int ctrl_id;
    } ctrl_map[] = {
      { "enable_deblocking", AV2E_SET_ENABLE_DEBLOCKING },
      { "enable_cdef", AV2E_SET_ENABLE_CDEF },
      { "enable_restoration", AV2E_SET_ENABLE_RESTORATION },
      { "enable_tpl_model", AV2E_SET_ENABLE_TPL_MODEL },
      { "enable_keyframe_filtering", AV2E_SET_ENABLE_KEYFRAME_FILTERING },
      { "enable_global_motion", AV2E_SET_ENABLE_GLOBAL_MOTION },
      { "enable_warped_motion", AV2E_SET_ENABLE_WARPED_MOTION },
      { "enable_intrabc", AV2E_SET_ENABLE_INTRABC },
      { "enable_palette", AV2E_SET_ENABLE_PALETTE },
      { "enable_interintra_comp", AV2E_SET_ENABLE_INTERINTRA_COMP },
      { "enable_smooth_interintra", AV2E_SET_ENABLE_SMOOTH_INTERINTRA },
      { "enable_interintra_wedge", AV2E_SET_ENABLE_INTERINTRA_WEDGE },
      { "enable_onesided_comp", AV2E_SET_ENABLE_ONESIDED_COMP },
      { "enable_masked_comp", AV2E_SET_ENABLE_MASKED_COMP },
      { "enable_diff_wtd_comp", AV2E_SET_ENABLE_DIFF_WTD_COMP },
      { "enable_interinter_wedge", AV2E_SET_ENABLE_INTERINTER_WEDGE },
      { "enable_ref_frame_mvs", AV2E_SET_ENABLE_REF_FRAME_MVS },
      { "enable_overlay", AV2E_SET_ENABLE_OVERLAY },
      { "enable_angle_delta", AV2E_SET_ENABLE_ANGLE_DELTA },
    };
    static const int num_ctrl_map =
        (int)(sizeof(ctrl_map) / sizeof(ctrl_map[0]));

    for (int c = 0; c < xlcfg->num_codec_controls; c++) {
      const char *name = xlcfg->codec_controls[c].name;
      int value = xlcfg->codec_controls[c].value;
      int found = 0;
      for (int k = 0; k < num_ctrl_map; k++) {
        if (strcmp(name, ctrl_map[k].name) == 0) {
          avm_codec_control(&state->encoder, ctrl_map[k].ctrl_id, value);
          found = 1;
          break;
        }
      }
      if (!found) {
        fprintf(stderr,
                "Warning: xlayer %d unknown codec_control \"%s\" (ignored)\n",
                xlcfg->xlayer_id, name);
      }
    }
  }

  // Allocate raw frame buffer
  // (skipped for a directly opened Y4M file; the image is only zeroed then)
  if (use_shared_source || state->input.file_type != FILE_TYPE_Y4M) {
    if (!avm_img_alloc(&state->raw, state->input.fmt, state->input.width,
                       state->input.height, 32)) {
      fprintf(stderr, "Error: failed to allocate image for xlayer %d\n",
              xlcfg->xlayer_id);
      return -1;
    }
  } else {
    memset(&state->raw, 0, sizeof(state->raw));
  }

  // Allocate per-mlayer raw frame buffers for mlayers with their own source
  // (a different input source, or the same source with a different crop)
  if (xlcfg->has_per_mlayer_sources) {
    for (int m = 0; m < xlcfg->num_embedded_layers; m++) {
      const MLayerSourceConfig *ms = &xlcfg->mlayer_sources[m];
      if (ms->input_source_idx >= 0 &&
          (ms->input_source_idx != xlcfg->input_source_idx ||
           mlayer_crop_differs(xlcfg, m))) {
        // Unspecified mlayer dimensions fall back to the xlayer's.
        unsigned int mw = ms->width > 0 ? ms->width : state->input.width;
        unsigned int mh = ms->height > 0 ? ms->height : state->input.height;
        if (!avm_img_alloc(&state->mlayer_raw[m], state->input.fmt, mw, mh,
                           32)) {
          fprintf(stderr,
                  "Error: failed to allocate mlayer %d image for xlayer %d\n",
                  m, xlcfg->xlayer_id);
          return -1;
        }
        state->mlayer_raw_allocated[m] = 1;
      }
    }
  }

  fprintf(stderr, "Initialized xlayer %d: %ux%u%s\n", xlcfg->xlayer_id,
          state->input.width, state->input.height,
          use_shared_source ? " (shared source)" : "");

  return 0;
}

// Read one frame from an xlayer's input. Returns 1 if a frame is available.
+static int read_xlayer_frame(XLayerEncoderState *state) { + if (state->eof) return 0; + + int frame_avail; + if (state->input.file_type == FILE_TYPE_Y4M) { + frame_avail = (y4m_input_fetch_frame(&state->input.y4m, state->input.file, + &state->raw) >= 1); + } else { + frame_avail = (read_yuv_frame(&state->input, &state->raw) == 0); + } + + if (!frame_avail) { + state->eof = 1; + return 0; + } + return 1; +} + +// Upshift a raw frame to the encoder's internal bit depth if needed. +// Lazily allocates the shift buffer on first use. Returns the frame +// pointer the encoder should consume (either the original or shifted). +static avm_image_t *upshift_frame_if_needed(avm_image_t *raw, + avm_image_t *raw_shift, + int *allocated_shift, + int input_shift, + int input_bit_depth) { + if (input_shift || input_bit_depth == 8) { + if (!*allocated_shift) { + avm_img_alloc(raw_shift, raw->fmt | AVM_IMG_FMT_HIGHBITDEPTH, raw->d_w, + raw->d_h, 32); + *allocated_shift = 1; + } + avm_img_upshift(raw_shift, raw, input_shift); + return raw_shift; + } + return raw; +} + +// Check if an mlayer has different crop coordinates than the xlayer +static int mlayer_crop_differs(const XLayerEncConfig *xlcfg, int ml) { + const MLayerSourceConfig *ms = &xlcfg->mlayer_sources[ml]; + if (ms->atlas_pos_x >= 0 && ms->atlas_pos_x != xlcfg->atlas_pos_x) return 1; + if (ms->atlas_pos_y >= 0 && ms->atlas_pos_y != xlcfg->atlas_pos_y) return 1; + if (ms->width > 0 && ms->width != xlcfg->width) return 1; + if (ms->height > 0 && ms->height != xlcfg->height) return 1; + return 0; +} + +// Set scaling mode and mlayer_id controls for multi-layer encoding. +// No-op when n_ml <= 1 (single embedded layer). +// When use_internal_kf is true, the encoder manages mlayer switching internally +// (multi_layers_lag_test mode), so AVME_SET_MLAYER_ID is not set. 
+static void apply_mlayer_settings(avm_codec_ctx_t *encoder, int n_ml, int ml, + const int *scaling_modes, + int use_internal_kf) { + if (n_ml <= 1) return; + + // Set scaling mode for every embedded layer + int sm = scaling_modes[ml]; + struct avm_scaling_mode mode = { sm, sm }; + avm_codec_control(encoder, AVME_SET_SCALEMODE, &mode); + + // Only set mlayer_id explicitly in non-internal-kf mode + if (!use_internal_kf) { + avm_codec_control(encoder, AVME_SET_MLAYER_ID, (unsigned int)ml); + } +} + +// Destroy an xlayer encoder state +static void destroy_xlayer_encoder(XLayerEncoderState *state) { + avm_codec_destroy(&state->encoder); + if (state->input.file) fclose(state->input.file); + avm_img_free(&state->raw); + if (state->allocated_raw_shift) avm_img_free(&state->raw_shift); + for (int m = 0; m < MAX_NUM_MLAYERS; m++) { + if (state->mlayer_raw_allocated[m]) avm_img_free(&state->mlayer_raw[m]); + if (state->mlayer_raw_shift_allocated[m]) + avm_img_free(&state->mlayer_raw_shift[m]); + } +} + +// Per-xlayer packet buffer for collecting encoder output +typedef struct XLayerPacketBuf { + uint8_t *data; + size_t size; + size_t capacity; + int has_keyframe; + int has_data; +} XLayerPacketBuf; + +static void pktbuf_init(XLayerPacketBuf *pb) { + memset(pb, 0, sizeof(*pb)); + pb->capacity = 64 * 1024; + pb->data = (uint8_t *)malloc(pb->capacity); +} + +static void pktbuf_reset(XLayerPacketBuf *pb) { + pb->size = 0; + pb->has_keyframe = 0; + pb->has_data = 0; +} + +static void pktbuf_free(XLayerPacketBuf *pb) { + free(pb->data); + pb->data = NULL; + pb->size = 0; + pb->capacity = 0; +} + +static int pktbuf_append(XLayerPacketBuf *pb, const uint8_t *data, size_t sz) { + size_t needed = pb->size + sz; + if (needed > pb->capacity) { + size_t new_cap = pb->capacity * 2; + if (new_cap < needed) new_cap = needed; + uint8_t *new_buf = (uint8_t *)realloc(pb->data, new_cap); + if (!new_buf) return -1; + pb->data = new_buf; + pb->capacity = new_cap; + } + memcpy(pb->data + pb->size, 
data, sz); + pb->size += sz; + return 0; +} + +// Drain all pending packets from an encoder into a packet buffer. +// Returns 1 if any frame packet was collected, 0 otherwise. +static int drain_encoder_packets(avm_codec_ctx_t *encoder, + XLayerEncoderState *state, + XLayerPacketBuf *pb) { + int got_data = 0; + avm_codec_iter_t iter = NULL; + const avm_codec_cx_pkt_t *pkt; + while ((pkt = avm_codec_get_cx_data(encoder, &iter))) { + if (pkt->kind == AVM_CODEC_CX_FRAME_PKT) { + pb->has_data = 1; + got_data = 1; + state->frames_out++; + if (pkt->data.frame.flags & AVM_FRAME_IS_KEY) { + pb->has_keyframe = 1; + } + pktbuf_append(pb, (const uint8_t *)pkt->data.frame.buf, + pkt->data.frame.sz); + } + } + return got_data; +} + +// Assemble a TU from collected per-xlayer packet buffers and write to file. +// Sets *first_output to 0 after writing structural OBUs. +// Write combined TUs from multiple xlayers' internal-KF encoder output. +// Each xlayer's pktbuf is parsed into TU segments (split at TD boundaries), +// then matching segments across xlayers are combined into single output TUs. +// This ensures all xlayers' frames for the same temporal unit share one TD +// and one set of structural OBUs, satisfying the DOH constraint. 
+// Splits every xlayer's drained packet data into TU segments and writes one
+// combined TU per segment position to `outfile`.
+//
+// For segment position t the TU is assembled in `tu_asm` as: a TD
+// (tu_assembler_write_td), the structural OBUs
+// (tu_assembler_write_structural_obus) and then, for every xlayer that has a
+// segment t, an optional local LCR followed by that segment's bytes. The TU
+// is printed when `verbose` is non-zero, *tu_count is incremented, and the
+// assembler is flushed to `outfile`.
+//
+// xl_segs/xl_nseg hold MAX_NUM_XLAYERS - 1 entries, so `num_xlayers` must not
+// exceed that; encode_multi_xlayer() validates this before calling.
+static void write_combined_internal_kf_tus(TUAssembler *tu_asm,
+                                           const MultiXLayerConfig *mcfg,
+                                           const XLayerEncoderState *states,
+                                           const XLayerPacketBuf *pktbufs,
+                                           int num_xlayers, int *first_output,
+                                           FILE *outfile, int verbose,
+                                           int *tu_count) {
+  // Parse each xlayer's output into TU segments
+  TUSegmentInfo xl_segs[MAX_NUM_XLAYERS - 1][MAX_TU_SEGMENTS];
+  int xl_nseg[MAX_NUM_XLAYERS - 1];
+  int max_nseg = 0;
+
+  for (int i = 0; i < num_xlayers; i++) {
+    if (pktbufs[i].has_data) {
+      xl_nseg[i] = tu_assembler_parse_tu_segments(
+          pktbufs[i].data, pktbufs[i].size, xl_segs[i], MAX_TU_SEGMENTS);
+      if (xl_nseg[i] > max_nseg) max_nseg = xl_nseg[i];
+    } else {
+      xl_nseg[i] = 0;
+    }
+  }
+
+  // Write one combined TU per segment position
+  for (int t = 0; t < max_nseg; t++) {
+    tu_asm->size = 0;
+    tu_assembler_write_td(tu_asm);
+
+    int any_kf = 0;
+    for (int i = 0; i < num_xlayers; i++) {
+      if (t < xl_nseg[i] && xl_segs[i][t].has_keyframe) any_kf = 1;
+    }
+
+    // Local LCRs go into the first output TU and into every keyframe TU.
+    // Sampled before tu_assembler_write_structural_obus(), which receives
+    // first_output by pointer (NOTE(review): presumably it clears the flag).
+    const int emit_local_lcr =
+        mcfg->enable_local_lcr && (*first_output || any_kf);
+    tu_assembler_write_structural_obus(tu_asm, mcfg, first_output, any_kf);
+
+    for (int i = 0; i < num_xlayers; i++) {
+      if (t < xl_nseg[i]) {
+        // Emit local LCR right before this xlayer's data (per spec: local
+        // config precedes the xlayer's SH/frame OBUs within each xlayer group)
+        if (emit_local_lcr) tu_assembler_write_local_lcr(tu_asm, i);
+        tu_assembler_append_xlayer_obus(tu_asm, states[i].xlayer_id,
+                                        pktbufs[i].data + xl_segs[i][t].offset,
+                                        xl_segs[i][t].size);
+      }
+    }
+    if (verbose) tu_assembler_print_contents(tu_asm, (*tu_count));
+    (*tu_count)++;
+    tu_assembler_flush(tu_asm, outfile);
+  }
+}
+
+// Encodes all xlayers described by `mcfg` and writes the combined TUs to
+// mcfg->output_filename.
+//
+// mcfg:   multi-xlayer configuration (xlayers, input sources, output name,
+//         optional frame limit).
+// global: command-line encoder configuration; global->limit overrides
+//         mcfg->limit when positive, global->verbose enables TU printing.
+//
+// Returns 0 on success and -1 on any failure (invalid configuration,
+// allocation/initialisation failure, encode error, or an error reported when
+// closing the output file). All resources acquired here are released before
+// returning.
+int encode_multi_xlayer(const MultiXLayerConfig *mcfg,
+                        const struct AvxEncoderConfig *global) {
+  const int num_xlayers = mcfg->num_xlayers;
+  XLayerEncoderState *states = NULL;
+  XLayerPacketBuf *pktbufs = NULL;
+  TUAssembler tu_asm;
+  SharedSourceReader shared_srcs[MAX_INPUT_SOURCES];
+  const int num_shared_srcs = mcfg->num_input_sources;
+  FILE *outfile = NULL;
+  int ret = -1;
+
+  // Reject counts/indices that would overrun the fixed-size arrays used below
+  // (shared_srcs, src_xl_count, src_xl_indices, and xl_segs/xl_nseg in
+  // write_combined_internal_kf_tus). Nothing has been acquired yet, so a
+  // plain return is safe here.
+  if (num_xlayers < 1 || num_xlayers > MAX_NUM_XLAYERS - 1) {
+    fprintf(stderr, "Error: invalid number of xlayers (%d)\n", num_xlayers);
+    return -1;
+  }
+  if (num_shared_srcs > MAX_INPUT_SOURCES) {
+    fprintf(stderr, "Error: too many input sources (%d)\n", num_shared_srcs);
+    return -1;
+  }
+  for (int i = 0; i < num_xlayers; i++) {
+    const int sidx = mcfg->xlayers[i].input_source_idx;
+    if (sidx >= 0 && sidx >= num_shared_srcs) {
+      fprintf(stderr,
+              "Error: xlayer index %d refers to unknown input source %d\n", i,
+              sidx);
+      return -1;
+    }
+  }
+
+  // Merge CLI and JSON limits (CLI overrides JSON)
+  int limit = global->limit;
+  if (limit <= 0 && mcfg->limit > 0) limit = mcfg->limit;
+
+  memset(&tu_asm, 0, sizeof(tu_asm));
+  memset(shared_srcs, 0, sizeof(shared_srcs));
+
+  // Allocate per-xlayer encoder states and packet buffers
+  states = (XLayerEncoderState *)calloc(num_xlayers, sizeof(*states));
+  pktbufs = (XLayerPacketBuf *)calloc(num_xlayers, sizeof(*pktbufs));
+  if (!states || !pktbufs) {
+    fprintf(stderr, "Error: failed to allocate xlayer encoder states\n");
+    goto cleanup;
+  }
+  for (int i = 0; i < num_xlayers; i++) pktbuf_init(&pktbufs[i]);
+
+  // Initialize shared source readers for each input source
+  for (int s = 0; s < num_shared_srcs; s++) {
+    if (shared_source_init(&shared_srcs[s], &mcfg->input_sources[s], mcfg) !=
+        0)
+      goto cleanup;
+  }
+
+  // Initialize TU assembler
+  if (tu_assembler_init(&tu_asm, mcfg) != 0) {
+    fprintf(stderr, "Error: failed to initialize TU assembler\n");
+    goto cleanup;
+  }
+
+  // Open output file
+  const char *outpath = mcfg->output_filename;
+  if (outpath[0] == '\0') {
+    fprintf(stderr, "Error: no output filename specified in xlayer config\n");
+    goto cleanup;
+  }
+  outfile = fopen(outpath, "wb");
+  if (!outfile) {
+    fprintf(stderr, "Error: cannot open output file \"%s\"\n", outpath);
+    goto cleanup;
+  }
+
+  // Initialize all xlayer encoders
+  for (int i = 0; i < num_xlayers; i++) {
+    int xl_uses_shared = (mcfg->xlayers[i].input_source_idx >= 0);
+    if (init_xlayer_encoder(&states[i], &mcfg->xlayers[i], mcfg, global,
+                            xl_uses_shared) != 0) {
+      goto cleanup;
+    }
+  }
+
+  fprintf(stderr, "Multi-xlayer encoding: %d xlayers, output=\"%s\"\n",
+          num_xlayers, outpath);
+
+  // Pre-index: for each input source, store which xlayer indices use it.
+  // Avoids O(num_xlayers) scan per source per frame in the hot loop.
+  // Source indices were range-checked above, and each xlayer contributes at
+  // most one entry, so neither array can overflow.
+  int src_xl_count[MAX_INPUT_SOURCES] = { 0 };
+  int src_xl_indices[MAX_INPUT_SOURCES][MAX_NUM_XLAYERS - 1];
+  for (int i = 0; i < num_xlayers; i++) {
+    int sidx = mcfg->xlayers[i].input_source_idx;
+    if (sidx >= 0) {
+      src_xl_indices[sidx][src_xl_count[sidx]++] = i;
+    }
+  }
+
+  // Main encoding loop
+  unsigned int frame_idx = 0;
+  int any_active = 1;
+  int first_output = 1;
+  int tu_count = 0;
+  const int verbose = global->verbose;
+
+  while (any_active) {
+    any_active = 0;
+
+    if (limit > 0 && (int)frame_idx >= limit) break;
+
+    // Read frames: from shared sources and/or per-xlayer inputs
+    // Only read from sources whose frame_skip aligns with this TU
+    for (int s = 0; s < num_shared_srcs; s++) {
+      if (!shared_srcs[s].initialized || shared_srcs[s].eof) continue;
+      int skip = mcfg->input_sources[s].frame_skip;
+      if (skip > 1 && (frame_idx % (unsigned int)skip) != 0) continue;
+      if (!shared_source_read_frame(&shared_srcs[s])) {
+        // Mark all xlayers using this source as EOF
+        for (int j = 0; j < src_xl_count[s]; j++)
+          states[src_xl_indices[s][j]].eof = 1;
+      } else {
+        // Crop regions for xlayers using this source
+        for (int j = 0; j < src_xl_count[s]; j++) {
+          int i = src_xl_indices[s][j];
+          crop_region_to_xlayer(
+              &states[i].raw, &shared_srcs[s].raw, mcfg->xlayers[i].atlas_pos_x,
+              mcfg->xlayers[i].atlas_pos_y, mcfg->xlayers[i].width,
+              mcfg->xlayers[i].height);
+        }
+      }
+    }
+
+    // Read from per-xlayer inputs for xlayers not using any shared source
+    for (int i = 0; i < num_xlayers; i++) {
+      if (mcfg->xlayers[i].input_source_idx < 0 && !states[i].eof) {
+        read_xlayer_frame(&states[i]);
+      }
+    }
+
+    // Encode xlayers for this frame.
+    // Xlayers whose source is skipped this TU are not encoded.
+    //
+    // For multi-mlayer xlayers with lag == 0, keyframes are managed here: the
+    // first frame is always a keyframe, and subsequent keyframes are placed at
+    // kf_max_dist intervals. On a keyframe TU every independent mlayer
+    // (dependency mask == 0) gets AVM_EFLAG_FORCE_KF so that CLK OBUs are
+    // aligned across layers (spec requirement: first mlayer and all
+    // independent mlayers must be CLK when any mlayer is CLK).
+
+    // Reset packet buffers before encoding this TU
+    for (int i = 0; i < num_xlayers; i++) pktbuf_reset(&pktbufs[i]);
+    int got_data = 0;
+
+    for (int i = 0; i < num_xlayers; i++) {
+      // Check if this xlayer's source is active this TU
+      int sidx = mcfg->xlayers[i].input_source_idx;
+      if (sidx >= 0) {
+        int skip = mcfg->input_sources[sidx].frame_skip;
+        if (skip > 1 && (frame_idx % (unsigned int)skip) != 0) continue;
+      }
+
+      const XLayerEncConfig *xlcfg = &mcfg->xlayers[i];
+      int n_ml = xlcfg->num_embedded_layers;
+      int use_internal_kf = (n_ml > 1 && states[i].cfg.g_lag_in_frames > 0);
+
+      for (int ml = 0; ml < n_ml; ml++) {
+        // img stays NULL once the xlayer's input is exhausted, which turns
+        // this call into a flush of the encoder.
+        avm_image_t *img = NULL;
+        if (!states[i].eof) {
+          if (xlcfg->has_per_mlayer_sources &&
+              states[i].mlayer_raw_allocated[ml]) {
+            // Per-mlayer source: crop from the mlayer's own source. The
+            // index is range-checked so a bad config cannot read past
+            // shared_srcs.
+            int msrc = xlcfg->mlayer_sources[ml].input_source_idx;
+            if (msrc >= 0 && msrc < num_shared_srcs &&
+                shared_srcs[msrc].initialized && !shared_srcs[msrc].eof) {
+              crop_region_to_xlayer(&states[i].mlayer_raw[ml],
+                                    &shared_srcs[msrc].raw,
+                                    xlcfg->mlayer_sources[ml].atlas_pos_x,
+                                    xlcfg->mlayer_sources[ml].atlas_pos_y,
+                                    xlcfg->mlayer_sources[ml].width,
+                                    xlcfg->mlayer_sources[ml].height);
+            }
+            img = upshift_frame_if_needed(
+                &states[i].mlayer_raw[ml], &states[i].mlayer_raw_shift[ml],
+                &states[i].mlayer_raw_shift_allocated[ml],
+                states[i].input_shift, states[i].input.bit_depth);
+          } else {
+            // Default: use xlayer's shared image
+            img = upshift_frame_if_needed(&states[i].raw, &states[i].raw_shift,
+                                          &states[i].allocated_raw_shift,
+                                          states[i].input_shift,
+                                          states[i].input.bit_depth);
+          }
+        }
+
+        apply_mlayer_settings(&states[i].encoder, n_ml, ml, xlcfg->scaling_mode,
+                              use_internal_kf);
+
+        // For multi-mlayer with lag == 0: force KF on independent mlayers
+        // (dependency_mask == 0) on keyframe TUs. Dependent layers use
+        // inter-layer prediction from the KF of lower layers.
+        // For multi-mlayer with lag > 0: internal KF management handles
+        // keyframes via multi_layers_lag_test; only frame 0 is forced here.
+        // For single-mlayer: use standard encoder-internal keyframe handling.
+        int frame_flags = 0;
+        if (n_ml > 1 && !use_internal_kf) {
+          int is_kf_tu = (frame_idx == 0);
+          if (xlcfg->kf_max_dist > 0 && frame_idx > 0) {
+            is_kf_tu = (frame_idx % xlcfg->kf_max_dist == 0);
+          }
+          if (is_kf_tu) {
+            int mask = resolve_mlayer_dep_mask(&xlcfg->mlayer_sources[ml], ml);
+            if (mask == 0) frame_flags |= AVM_EFLAG_FORCE_KF;
+          }
+        } else {
+          if (frame_idx == 0) frame_flags |= AVM_EFLAG_FORCE_KF;
+        }
+
+        struct avm_usec_timer timer;
+        avm_usec_timer_start(&timer);
+
+        avm_codec_err_t res = avm_codec_encode(
+            &states[i].encoder, img, states[i].frame_count, 1, frame_flags);
+        avm_usec_timer_mark(&timer);
+        states[i].cx_time += avm_usec_timer_elapsed(&timer);
+        states[i].frame_count++;
+
+        if (res != AVM_CODEC_OK) {
+          fprintf(stderr,
+                  "Error: encode failed for xlayer %d frame %u ml %d: %s\n",
+                  states[i].xlayer_id, frame_idx, ml,
+                  avm_codec_error(&states[i].encoder));
+          goto cleanup;
+        }
+
+        // Drain packets immediately — the encoder clears its packet list
+        // on the next avm_codec_encode call, so we must collect before then.
+        if (drain_encoder_packets(&states[i].encoder, &states[i], &pktbufs[i]))
+          got_data = 1;
+      }
+    }
+
+    // Assemble TU(s) from collected packets.
+    // Always use segment-based TU assembly: the encoder may emit TDs within
+    // a single packet blob (e.g. OLK in its own TU, then leading frames in
+    // subsequent TUs; or internal-KF mode with multiple TUs per GF group).
+    // The segment parser splits at TD boundaries and writes one output TU
+    // per segment, combining matching segments across xlayers.
+    if (got_data) {
+      write_combined_internal_kf_tus(&tu_asm, mcfg, states, pktbufs,
+                                     num_xlayers, &first_output, outfile,
+                                     verbose, &tu_count);
+      for (int i = 0; i < num_xlayers; i++) pktbuf_reset(&pktbufs[i]);
+    }
+
+    // Check if any encoder still has input
+    for (int i = 0; i < num_xlayers; i++) {
+      if (!states[i].eof) any_active = 1;
+    }
+
+    frame_idx++;
+  }
+
+  // Flush all encoders. Each round pushes one NULL frame per mlayer into
+  // every xlayer's encoder, collects whatever comes out, and combines
+  // matching segments across xlayers into shared TUs.
+  //
+  // The internal pipeline may need many NULL pushes before it starts
+  // producing output (e.g. lag_in_frames rounds). We keep flushing until
+  // max_dry_rounds consecutive rounds have produced no data.
+  int dry_rounds = 0;
+  const int max_dry_rounds = 50;  // generous upper bound
+  while (dry_rounds < max_dry_rounds) {
+    int flushed_data = 0;
+
+    for (int i = 0; i < num_xlayers; i++) {
+      const int n_ml = mcfg->xlayers[i].num_embedded_layers;
+      const int internal_kf = (n_ml > 1 && states[i].cfg.g_lag_in_frames > 0);
+
+      // Accumulate all of this xlayer's output for the round before
+      // combining it with the other xlayers.
+      pktbuf_reset(&pktbufs[i]);
+      for (int ml = 0; ml < n_ml; ml++) {
+        // Internal KF mode: the encoder manages mlayer switching itself, so
+        // mlayer settings are only applied in the non-internal-KF case.
+        if (!internal_kf) {
+          apply_mlayer_settings(&states[i].encoder, n_ml, ml,
+                                mcfg->xlayers[i].scaling_mode, internal_kf);
+        }
+        struct avm_usec_timer timer;
+        avm_usec_timer_start(&timer);
+        const avm_codec_err_t res = avm_codec_encode(
+            &states[i].encoder, NULL, states[i].frame_count, 1, 0);
+        avm_usec_timer_mark(&timer);
+        states[i].cx_time += avm_usec_timer_elapsed(&timer);
+        states[i].frame_count++;
+
+        // A failed flush would otherwise go unnoticed and the run would
+        // still report success; treat it like an encode error above.
+        if (res != AVM_CODEC_OK) {
+          fprintf(stderr, "Error: flush failed for xlayer %d ml %d: %s\n",
+                  states[i].xlayer_id, ml,
+                  avm_codec_error(&states[i].encoder));
+          goto cleanup;
+        }
+
+        if (drain_encoder_packets(&states[i].encoder, &states[i],
+                                  &pktbufs[i]))
+          flushed_data = 1;
+      }
+    }
+
+    // After all xlayers have been flushed for this round, write combined TUs.
+    if (flushed_data) {
+      write_combined_internal_kf_tus(&tu_asm, mcfg, states, pktbufs,
+                                     num_xlayers, &first_output, outfile,
+                                     verbose, &tu_count);
+      for (int i = 0; i < num_xlayers; i++) pktbuf_reset(&pktbufs[i]);
+      dry_rounds = 0;
+    } else {
+      dry_rounds++;
+    }
+  }
+
+  // Print summary
+  fprintf(stderr, "\nMulti-xlayer encoding complete:\n");
+  for (int i = 0; i < num_xlayers; i++) {
+    // cx_time is accumulated in microseconds; avoid dividing by zero when no
+    // measurable time was spent.
+    const double seconds = states[i].cx_time / 1000000.0;
+    fprintf(stderr, " xlayer %d: %u frames, %.1fs (%.1f fps)\n",
+            states[i].xlayer_id, states[i].frames_out, seconds,
+            (states[i].frames_out > 0 && states[i].cx_time > 0)
+                ? (double)states[i].frames_out / seconds
+                : 0.0);
+  }
+
+  ret = 0;
+
+cleanup:
+  if (pktbufs) {
+    for (int i = 0; i < num_xlayers; i++) pktbuf_free(&pktbufs[i]);
+    free(pktbufs);
+  }
+  if (states) {
+    for (int i = 0; i < num_xlayers; i++) {
+      destroy_xlayer_encoder(&states[i]);
+    }
+    free(states);
+  }
+  tu_assembler_free(&tu_asm);
+  for (int s = 0; s < num_shared_srcs; s++)
+    shared_source_destroy(&shared_srcs[s]);
+  // fclose() flushes buffered output; a failure here means the bitstream on
+  // disk is incomplete, so it must not be reported as success.
+  if (outfile && fclose(outfile) != 0 && ret == 0) {
+    fprintf(stderr, "Error: failed to write output file \"%s\"\n",
+            mcfg->output_filename);
+    ret = -1;
+  }
+  return ret;
+}
diff --git a/apps/avmenc_xlayer.h b/apps/avmenc_xlayer.h
new file mode 100644
index 0000000000..77cee603b1
--- /dev/null
+++ b/apps/avmenc_xlayer.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 3-Clause Clear License
+ * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
+ * License was not distributed with this source code in the LICENSE file, you
+ * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * aomedia.org/license/patent-license/.
+ */
+
+#ifndef AVM_APPS_AVMENC_XLAYER_H_
+#define AVM_APPS_AVMENC_XLAYER_H_
+
+#include "avm/avm_encoder.h"
+#include "avm/avmcx.h"
+#include "common/tools_common.h"
+#include "common/xlayer_config.h"
+#include "common/tu_assembler.h"
+#include "apps/avmenc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Per-xlayer encoder state. encode_multi_xlayer() allocates one of these per
+// configured xlayer (zero-initialised with calloc) and releases it with
+// destroy_xlayer_encoder().
+typedef struct XLayerEncoderState {
+  // Id attached to this xlayer's OBUs when they are appended to a TU; also
+  // printed in error and summary messages.
+  int xlayer_id;
+  // Input context; its bit_depth is passed to upshift_frame_if_needed().
+  struct AvxInputContext input;
+  // Codec context handed to avm_codec_encode() for this xlayer.
+  avm_codec_ctx_t encoder;
+  // Encoder configuration; g_lag_in_frames > 0 together with more than one
+  // embedded layer selects the internal-keyframe path.
+  avm_codec_enc_cfg_t cfg;
+  // Frame to encode: filled by cropping a shared source or by
+  // read_xlayer_frame().
+  avm_image_t raw;
+  // Scratch image, allocation flag and shift amount passed to
+  // upshift_frame_if_needed() together with `raw`.
+  avm_image_t raw_shift;
+  int allocated_raw_shift;
+  int input_shift;
+  // Reported as the frame count in the end-of-run summary.
+  unsigned int frames_out;
+  uint32_t
+      frame_count;  // PTS counter (advances per encode call, not per frame)
+  // Time spent inside avm_codec_encode(), accumulated from
+  // avm_usec_timer_elapsed(); the summary divides it by 1e6 to print seconds.
+  uint64_t cx_time;
+  int eof;  // input exhausted
+  // Per-embedded-layer raw buffers (for per-mlayer input sources).
+  // mlayer_raw[ml] is used only when mlayer_raw_allocated[ml] is set and the
+  // xlayer config has has_per_mlayer_sources; the *_shift members play the
+  // same role as raw_shift/allocated_raw_shift above.
+  avm_image_t mlayer_raw[MAX_NUM_MLAYERS];
+  avm_image_t mlayer_raw_shift[MAX_NUM_MLAYERS];
+  int mlayer_raw_allocated[MAX_NUM_MLAYERS];
+  int mlayer_raw_shift_allocated[MAX_NUM_MLAYERS];
+} XLayerEncoderState;
+
+// Run multi-xlayer encoding. Returns 0 on success.
+int encode_multi_xlayer(const MultiXLayerConfig *mcfg, + const struct AvxEncoderConfig *global); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AVM_APPS_AVMENC_XLAYER_H_ diff --git a/av2/arg_defs.c b/av2/arg_defs.c index 3a825b7b9b..2cede62aa9 100644 --- a/av2/arg_defs.c +++ b/av2/arg_defs.c @@ -904,4 +904,6 @@ const av2_codec_arg_definitions_t g_av2_codec_arg_defs = { "Cross frame CDF for context initialization " "(0: disable cross frame CDF init, 1: enable cross frame CDF " "init(default), "), + .xlayer_config = ARG_DEF(NULL, "xlayer-config", 1, + "Path to JSON config for multi-xlayer encoding"), }; diff --git a/av2/arg_defs.h b/av2/arg_defs.h index 7fe25641bc..06a774a1b2 100644 --- a/av2/arg_defs.h +++ b/av2/arg_defs.h @@ -294,6 +294,7 @@ typedef struct av2_codec_arg_definitions { arg_def_t enable_mfh_obu_signaling; arg_def_t operating_points_count; arg_def_t cross_frame_cdf_init_mode; + arg_def_t xlayer_config; } av2_codec_arg_definitions_t; extern const av2_codec_arg_definitions_t g_av2_codec_arg_defs; diff --git a/av2/av2_cx_iface.c b/av2/av2_cx_iface.c index 845d11b06e..8650fa6211 100644 --- a/av2/av2_cx_iface.c +++ b/av2/av2_cx_iface.c @@ -245,6 +245,7 @@ struct av2_extracfg { int buffer_refresh_multi_layers_test[REF_FRAMES]; int multi_layers_lag_test; int force_deferred_frames_for_ras_test; + int intra_only_fwd_kf; }; // Example subgop configs. Currently not used by default. @@ -574,6 +575,7 @@ static struct av2_extracfg default_extra_cfg = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // buffer_refresh_multi_layers_test 0, // multi_layers_test for nozero lag 0, // force_deferred_frames_for_ras_test + 0, }; // clang-format on @@ -595,6 +597,9 @@ struct avm_codec_alg_priv { avm_enc_frame_flags_t next_frame_flags; avm_codec_pkt_list_decl(256) pkt_list; unsigned int fixed_kf_cntr; + // For multi-mlayer with lag: persists across encoder_encode calls to prevent + // TDs from being inserted between hidden frames within the same TU. 
+ int mlayer_tu_ready; // BufferPool that holds all reference frames. BufferPool *buffer_pool; @@ -817,6 +822,7 @@ static avm_codec_err_t validate_config(avm_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, explicit_ref_frame_map, 0, 1); RANGE_CHECK(extra_cfg, add_sef_for_hidden_frames, 0, 1); RANGE_CHECK(extra_cfg, monotonic_output_order, 0, 1); + RANGE_CHECK(extra_cfg, intra_only_fwd_kf, 0, 1); if (extra_cfg->monotonic_output_order && extra_cfg->enable_keyframe_filtering > 0) ERROR("monotonic_output_order=1 requires enable_keyframe_filtering=0"); @@ -1515,6 +1521,7 @@ static avm_codec_err_t set_encoder_config(AV2EncoderConfig *oxcf, // Set Key frame configuration. kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled; + kf_cfg->intra_only_fwd_kf = extra_cfg->intra_only_fwd_kf; kf_cfg->auto_key = cfg->kf_mode == AVM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; kf_cfg->key_freq_min = cfg->kf_min_dist; @@ -1607,11 +1614,20 @@ static avm_codec_err_t set_encoder_config(AV2EncoderConfig *oxcf, oxcf->ref_frm_cfg.add_sef_for_hidden_frames = extra_cfg->add_sef_for_hidden_frames; oxcf->tool_cfg.monotonic_output_order = extra_cfg->monotonic_output_order; + // Monotonic output requires SEF OBUs for hidden frames — implicit output + // is not allowed when monotonic_output_order_flag is set in the sequence + // header. Force add_sef_for_hidden_frames on so the encoder produces SEF + // OBUs instead of relying on implicit output. if (oxcf->tool_cfg.monotonic_output_order && !oxcf->ref_frm_cfg.add_sef_for_hidden_frames) { - // `monotonic_output_order = 1` implies that `implicit_output_frame = 0`. - // So, explicit SEF OBUs must be signaled. 
oxcf->ref_frm_cfg.add_sef_for_hidden_frames = 1;
+    // NOTE(review): function-local static, so the warning is printed once per
+    // process rather than once per encoder instance, and the flag is not
+    // synchronised between threads.
+    static int warned_sef_override;
+    if (!warned_sef_override) {
+      warned_sef_override = 1;
+      fprintf(stderr,
+              "Warning: monotonic_output_order=1 forces "
+              "add_sef_for_hidden_frames=1 (--add-sef-for-output=1)\n");
+    }
   }
 
   oxcf->row_mt = extra_cfg->row_mt;
@@ -2752,6 +2768,130 @@ static avm_codec_err_t ctrl_set_force_deferred_frames_for_ras_test(
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+// Control handler for AVME_SET_XLAYER_ID.
+// Reads one int argument and stores it in cpi->common.xlayer_id.
+// Returns AVM_CODEC_INVALID_PARAM unless the value is in [0, 30].
+// NOTE(review): 30 is presumably GLOBAL_XLAYER_ID - 1 (the decoder reserves 31
+// for the global LCR) — confirm and consider using the named constant.
+// The value is written straight into the compressor instance, not through
+// extra_cfg/update_extra_cfg() like ctrl_set_intra_only_fwd_kf() below.
+static avm_codec_err_t ctrl_set_xlayer_id(avm_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const int xlayer_id = va_arg(args, int);
+  if (xlayer_id < 0 || xlayer_id > 30) return AVM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.xlayer_id = xlayer_id;
+  return AVM_CODEC_OK;
+}
+
+// Control handler for AV2E_SET_MLAYER_DEPENDENCY_PRESENT.
+// Reads one unsigned int argument (0 or 1) and stores it in the sequence
+// header's mlayer_dependency_present_flag. Any other value yields
+// AVM_CODEC_INVALID_PARAM.
+static avm_codec_err_t ctrl_set_mlayer_dependency_present(
+    avm_codec_alg_priv_t *ctx, va_list args) {
+  const unsigned int flag = va_arg(args, unsigned int);
+  if (flag > 1) return AVM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.seq_params.mlayer_dependency_present_flag = (int)flag;
+  return AVM_CODEC_OK;
+}
+
+// Control handler for AV2E_SET_MLAYER_DEPENDENCY_MAP.
+// Reads two unsigned int arguments: the mlayer index and a bit mask.
+// Bit j of the mask (for j < mlayer_idx) becomes
+// mlayer_dependency_map[mlayer_idx][j]; bits at or above mlayer_idx are
+// ignored and map entries for j > mlayer_idx are left untouched.
+// Returns AVM_CODEC_INVALID_PARAM when mlayer_idx >= MAX_NUM_MLAYERS.
+static avm_codec_err_t ctrl_set_mlayer_dependency_map(avm_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  const unsigned int mlayer_idx = va_arg(args, unsigned int);
+  const unsigned int mask = va_arg(args, unsigned int);
+  if (mlayer_idx >= MAX_NUM_MLAYERS) return AVM_CODEC_INVALID_PARAM;
+  SequenceHeader *seq = &ctx->cpi->common.seq_params;
+  for (int j = 0; j < (int)mlayer_idx; j++) {
+    seq->mlayer_dependency_map[mlayer_idx][j] = (mask >> j) & 1;
+  }
+  // Self-dependency is always 1
+  seq->mlayer_dependency_map[mlayer_idx][mlayer_idx] = 1;
+  return AVM_CODEC_OK;
+}
+
+// Control handler for AV2E_SET_INTRA_ONLY_FWD_KF.
+// Copies the current extra config, sets intra_only_fwd_kf from the control
+// argument and applies the copy through update_extra_cfg(), whose result is
+// returned.
+static avm_codec_err_t ctrl_set_intra_only_fwd_kf(avm_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av2_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.intra_only_fwd_kf = CAST(AV2E_SET_INTRA_ONLY_FWD_KF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+// Helper to derive color_description_idc from CICP triplet.
+static int derive_color_description_idc(avm_color_primaries_t cp,
+                                        avm_transfer_characteristics_t tc,
+                                        avm_matrix_coefficients_t mc) {
+  // CICP triplets that have a dedicated color_description_idc value. Any
+  // triplet not listed here must be signalled explicitly.
+  static const struct {
+    avm_color_primaries_t primaries;
+    avm_transfer_characteristics_t transfer;
+    avm_matrix_coefficients_t matrix;
+    int idc;
+  } known_triplets[] = {
+    { AVM_CICP_CP_BT_709, AVM_CICP_TC_BT_709, AVM_CICP_MC_BT_709,
+      AVM_COLOR_DESC_IDC_BT709SDR },
+    { AVM_CICP_CP_BT_709, AVM_CICP_TC_SRGB, AVM_CICP_MC_IDENTITY,
+      AVM_COLOR_DESC_IDC_SRGB },
+    { AVM_CICP_CP_BT_709, AVM_CICP_TC_SRGB, AVM_CICP_MC_BT_470_B_G,
+      AVM_COLOR_DESC_IDC_SYCC },
+    { AVM_CICP_CP_BT_2020, AVM_CICP_TC_SMPTE_2084, AVM_CICP_MC_BT_2020_NCL,
+      AVM_COLOR_DESC_IDC_BT2100PQ },
+    { AVM_CICP_CP_BT_2020, AVM_CICP_TC_HLG, AVM_CICP_MC_BT_2020_NCL,
+      AVM_COLOR_DESC_IDC_BT2100HLG },
+  };
+  const int num_known =
+      (int)(sizeof(known_triplets) / sizeof(known_triplets[0]));
+
+  for (int k = 0; k < num_known; k++) {
+    if (known_triplets[k].primaries == cp &&
+        known_triplets[k].transfer == tc && known_triplets[k].matrix == mc) {
+      return known_triplets[k].idc;
+    }
+  }
+  return AVM_COLOR_DESC_IDC_EXPLICIT;
+}
+
+// Recomputes the derived fields of a ContentInterpretation after one of its
+// colour fields changed: color_description_idc is re-derived from the CICP
+// triplet, and ci_color_description_present_flag is cleared only when the
+// description is explicit, all three CICP values are "unspecified" and
+// full_range_flag is 0 (i.e. there is nothing to signal).
+static void update_mlayer_ci_flags(ContentInterpretation *ci) {
+  ColorInfo *const color = &ci->color_info;
+
+  color->color_description_idc = derive_color_description_idc(
+      color->color_primaries, color->transfer_characteristics,
+      color->matrix_coefficients);
+
+  const int nothing_to_signal =
+      color->color_description_idc == AVM_COLOR_DESC_IDC_EXPLICIT &&
+      color->color_primaries == AVM_CICP_CP_UNSPECIFIED &&
+      color->transfer_characteristics == AVM_CICP_TC_UNSPECIFIED &&
+      color->matrix_coefficients == AVM_CICP_MC_UNSPECIFIED &&
+      color->full_range_flag == 0;
+
+  ci->ci_color_description_present_flag = nothing_to_signal ? 0 : 1;
+}
+
+// Common helper for per-mlayer CI control handlers.
+// field: 0=color_primaries, 1=transfer_characteristics,
+//        2=matrix_coefficients, 3=full_range_flag
+//
+// Reads two unsigned int control arguments (mlayer index, new value), stores
+// the value in the selected field of ci_params_per_layer[mlayer_idx], then
+// refreshes the derived CI flags and marks the layer's CI as overridden so a
+// CI OBU gets written.
+//
+// Returns AVM_CODEC_INVALID_PARAM when mlayer_idx >= MAX_NUM_MLAYERS or when
+// `field` is not one of the selectors above; in both cases no encoder state
+// is modified.
+static avm_codec_err_t set_mlayer_ci_field(avm_codec_alg_priv_t *ctx,
+                                           va_list args, int field) {
+  const unsigned int mlayer_idx = va_arg(args, unsigned int);
+  const unsigned int value = va_arg(args, unsigned int);
+  if (mlayer_idx >= MAX_NUM_MLAYERS) return AVM_CODEC_INVALID_PARAM;
+  ContentInterpretation *ci = &ctx->cpi->common.ci_params_per_layer[mlayer_idx];
+  switch (field) {
+    case 0:
+      ci->color_info.color_primaries = (avm_color_primaries_t)value;
+      break;
+    case 1:
+      ci->color_info.transfer_characteristics =
+          (avm_transfer_characteristics_t)value;
+      break;
+    case 2:
+      ci->color_info.matrix_coefficients = (avm_matrix_coefficients_t)value;
+      break;
+    case 3: ci->color_info.full_range_flag = value ? 1 : 0; break;
+    // An unknown selector must not mark the CI as overridden.
+    default: return AVM_CODEC_INVALID_PARAM;
+  }
+  update_mlayer_ci_flags(ci);
+  ctx->cpi->write_ci_obu_flag = 1;
+  ctx->cpi->ci_per_layer_overridden[mlayer_idx] = 1;
+  return AVM_CODEC_OK;
+}
+
+// Control handler for AV2E_SET_MLAYER_COLOR_PRIMARIES (field 0).
+static avm_codec_err_t ctrl_set_mlayer_color_primaries(
+    avm_codec_alg_priv_t *ctx, va_list args) {
+  return set_mlayer_ci_field(ctx, args, 0);
+}
+
+// Control handler for AV2E_SET_MLAYER_TRANSFER_CHARACTERISTICS (field 1).
+static avm_codec_err_t ctrl_set_mlayer_transfer_characteristics(
+    avm_codec_alg_priv_t *ctx, va_list args) {
+  return set_mlayer_ci_field(ctx, args, 1);
+}
+
+// Control handler for AV2E_SET_MLAYER_MATRIX_COEFFICIENTS (field 2).
+static avm_codec_err_t ctrl_set_mlayer_matrix_coefficients(
+    avm_codec_alg_priv_t *ctx, va_list args) {
+  return set_mlayer_ci_field(ctx, args, 2);
+}
+
+// Control handler for AV2E_SET_MLAYER_COLOR_RANGE (field 3).
+static avm_codec_err_t ctrl_set_mlayer_color_range(avm_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  return set_mlayer_ci_field(ctx, args, 3);
+}
+
 static avm_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
                                            STATS_BUFFER_CTX *stats_buf_context,
                                            int num_lap_buffers) {
@@ -2819,6 +2959,7 @@ static avm_codec_err_t encoder_init(avm_codec_ctx_t *ctx) {
     }
 
     priv->extra_cfg = default_extra_cfg;
+    priv->mlayer_tu_ready = 1;  // First frame starts a new TU.
avm_once(av2_initialize_enc); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); @@ -3065,18 +3206,19 @@ static void report_stats(AV2_COMP *cpi, size_t frame_size, uint64_t cx_time) { const bool use_hbd_psnr = (cpi->b_calculate_psnr == 2); if (cpi->oxcf.tool_cfg.enable_bru) { fprintf(stdout, - "POC:%6d [%s][BRU%1d:%1d][Level:%d][Q:%3d][LTID:%d]" - "[ELID:%d][TLID:%d]: %10" PRIu64 + "POC:%6d [XL:%d][%s][BRU%1d:%1d][Level:%d][Q:%3d][LTID:%d]" + "[ELID:%d][TLID:%d][OH:%d][DOH:%d]: %10" PRIu64 " Bytes, " "%6.1fms, %2.4f dB(Y), %2.4f dB(U), " "%2.4f dB(V), " "%2.4f dB(Avg)", - cm->cur_frame->absolute_poc, + cm->cur_frame->absolute_poc, cm->xlayer_id, frameType[cm->current_frame.frame_type + cpi->is_ras_frame], cm->bru.enabled, cm->bru.update_ref_idx, cm->cur_frame->pyramid_level, base_qindex, cm->cur_frame->long_term_id, cm->cur_frame->mlayer_id, - (int)cm->cur_frame->tlayer_id, (uint64_t)frame_size, + (int)cm->cur_frame->tlayer_id, cm->cur_frame->order_hint, + cm->cur_frame->display_order_hint, (uint64_t)frame_size, cx_time / 1000.0, use_hbd_psnr ? psnr.psnr_hbd[1] : psnr.psnr[1], use_hbd_psnr ? psnr.psnr_hbd[2] : psnr.psnr[2], @@ -3084,17 +3226,18 @@ static void report_stats(AV2_COMP *cpi, size_t frame_size, uint64_t cx_time) { use_hbd_psnr ? 
psnr.psnr_hbd[0] : psnr.psnr[0]); } else { fprintf(stdout, - "POC:%6d [%s][Level:%d][Q:%3d][LTID:%d]" - "[ELID:%d][TLID:%d]: %10" PRIu64 + "POC:%6d [XL:%d][%s][Level:%d][Q:%3d][LTID:%d]" + "[ELID:%d][TLID:%d][OH:%d][DOH:%d]: %10" PRIu64 " Bytes, " "%6.1fms, %2.4f dB(Y), %2.4f dB(U), " "%2.4f dB(V), " "%2.4f dB(Avg)", - cm->cur_frame->absolute_poc, + cm->cur_frame->absolute_poc, cm->xlayer_id, frameType[cm->current_frame.frame_type + cpi->is_ras_frame], cm->cur_frame->pyramid_level, base_qindex, cm->cur_frame->long_term_id, cm->cur_frame->mlayer_id, - (int)cm->cur_frame->tlayer_id, (uint64_t)frame_size, + (int)cm->cur_frame->tlayer_id, cm->cur_frame->order_hint, + cm->cur_frame->display_order_hint, (uint64_t)frame_size, cx_time / 1000.0, use_hbd_psnr ? psnr.psnr_hbd[1] : psnr.psnr[1], use_hbd_psnr ? psnr.psnr_hbd[2] : psnr.psnr[2], @@ -3104,28 +3247,30 @@ static void report_stats(AV2_COMP *cpi, size_t frame_size, uint64_t cx_time) { } else { if (cpi->oxcf.tool_cfg.enable_bru) { fprintf(stdout, - "POC:%6d [%s][BRU%1d:%1d][Level:%d][Q:%3d][LTID:%d]" - "[ELID:%d][TLID:%d]: %10" PRIu64 + "POC:%6d [XL:%d][%s][BRU%1d:%1d][Level:%d][Q:%3d][LTID:%d]" + "[ELID:%d][TLID:%d][OH:%d][DOH:%d]: %10" PRIu64 " Bytes, " "%6.1fms", - cm->cur_frame->absolute_poc, + cm->cur_frame->absolute_poc, cm->xlayer_id, frameType[cm->current_frame.frame_type + cpi->is_ras_frame], cm->bru.enabled, cm->bru.update_ref_idx, cm->cur_frame->pyramid_level, base_qindex, cm->cur_frame->long_term_id, cm->cur_frame->mlayer_id, - (int)cm->cur_frame->tlayer_id, (uint64_t)frame_size, + (int)cm->cur_frame->tlayer_id, cm->cur_frame->order_hint, + cm->cur_frame->display_order_hint, (uint64_t)frame_size, cx_time / 1000.0); } else { fprintf(stdout, - "POC:%6d [%s][Level:%d][Q:%3d][LTID:%d]" - "[ELID:%d][TLID:%d]: %10" PRIu64 + "POC:%6d [XL:%d][%s][Level:%d][Q:%3d][LTID:%d]" + "[ELID:%d][TLID:%d][OH:%d][DOH:%d]: %10" PRIu64 " Bytes, " "%6.1fms", - cm->cur_frame->absolute_poc, + cm->cur_frame->absolute_poc, 
cm->xlayer_id, frameType[cm->current_frame.frame_type + cpi->is_ras_frame], cm->cur_frame->pyramid_level, base_qindex, cm->cur_frame->long_term_id, cm->cur_frame->mlayer_id, - (int)cm->cur_frame->tlayer_id, (uint64_t)frame_size, + (int)cm->cur_frame->tlayer_id, cm->cur_frame->order_hint, + cm->cur_frame->display_order_hint, (uint64_t)frame_size, cx_time / 1000.0); } } @@ -3385,7 +3530,7 @@ static avm_codec_err_t encoder_encode(avm_codec_alg_priv_t *ctx, // Get the next visible frame. Invisible frames get packed with the next // visible frame. - int64_t dst_time_stamp; + int64_t dst_time_stamp = 0; int64_t dst_end_time_stamp; struct avm_usec_timer timer; if (cpi->compressor_stage == ENCODE_STAGE) { @@ -3395,7 +3540,12 @@ static avm_codec_err_t encoder_encode(avm_codec_alg_priv_t *ctx, cpi->subgop_stats.num_references[stat_idx] = -1; } } - int ready_for_next_tu = 1; + // In multi-mlayer with lag mode, TU boundaries persist across + // encoder_encode calls. Use the persistent flag so that hidden frames + // for ml>0 (processed in a separate call) don't start a new TU. + const int multi_ml_lag = cpi->oxcf.unit_test_cfg.multi_layers_lag_test && + cpi->common.number_mlayers > 1; + int ready_for_next_tu = multi_ml_lag ? ctx->mlayer_tu_ready : 1; while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 && !is_frame_visible) { @@ -3455,9 +3605,25 @@ static avm_codec_err_t encoder_encode(avm_codec_alg_priv_t *ctx, ready_for_next_tu = 0; } - if (mlayer_id == 0 && (cpi->common.immediate_output_picture || - cpi->common.implicit_output_picture)) { - ready_for_next_tu = 1; + if (multi_ml_lag) { + // Multi-mlayer with lag: mark TU boundary after the last mlayer + // produces an output frame. In non-monotonic mode, implicit + // output frames are also output (decoder reorders), so each gets + // its own TU. In monotonic mode, only immediate output triggers + // a TU boundary (hidden frames are bundled with their SEF). 
+ const int is_output = + cpi->common.immediate_output_picture || + (!cpi->common.seq_params.monotonic_output_order_flag && + cpi->common.implicit_output_picture); + if ((unsigned int)mlayer_id == cpi->common.number_mlayers - 1 && + is_output) { + ready_for_next_tu = 1; + } + } else { + if (mlayer_id == 0 && (cpi->common.immediate_output_picture || + cpi->common.implicit_output_picture)) { + ready_for_next_tu = 1; + } } size_t curr_frame_size = frame_size; @@ -3544,6 +3710,8 @@ static avm_codec_err_t encoder_encode(avm_codec_alg_priv_t *ctx, #endif // CONFIG_MIXED_LOSSLESS_ENCODE } } + // Persist TU readiness for multi-mlayer mode across encoder_encode calls. + if (multi_ml_lag) ctx->mlayer_tu_ready = ready_for_next_tu; if (is_frame_visible) { // Add the frame packet to the list of returned packets. avm_codec_cx_pkt_t pkt; @@ -3571,6 +3739,33 @@ static avm_codec_err_t encoder_encode(avm_codec_alg_priv_t *ctx, avm_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + ctx->pending_cx_data = NULL; + ctx->pending_cx_data_sz = 0; + ctx->pending_frame_count = 0; + } else if (!img && ctx->pending_cx_data_sz > 0) { + // Flush mode: the encoder ran out of frames but has accumulated + // implicit-output frames that were never followed by an + // immediate-output frame. Emit them as a packet so they are not lost. 
+ avm_codec_cx_pkt_t pkt; + + cpi->frames_left = AVMMAX(0, cpi->frames_left - 1); + pkt.kind = AVM_CODEC_CX_FRAME_PKT; + pkt.data.frame.buf = ctx->pending_cx_data; + pkt.data.frame.sz = ctx->pending_cx_data_sz; + pkt.data.frame.partition_id = -1; + pkt.data.frame.vis_frame_size = 0; + + pkt.data.frame.pts = + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + if (has_no_show_keyframe) { + pkt.data.frame.flags |= AVM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT; + } + pkt.data.frame.duration = 0; + + avm_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + ctx->pending_cx_data = NULL; ctx->pending_cx_data_sz = 0; ctx->pending_frame_count = 0; @@ -3652,7 +3847,6 @@ static avm_codec_err_t ctrl_get_new_frame_image(avm_codec_alg_priv_t *ctx, if (new_img != NULL) { YV12_BUFFER_CONFIG new_frame; - if (av2_get_last_show_frame(ctx->cpi, &new_frame) == 0) { yuvconfig2image(new_img, &new_frame, NULL); return AVM_CODEC_OK; @@ -4704,6 +4898,15 @@ static avm_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV2E_SET_MONOTONIC_OUTPUT_ORDER, ctrl_set_monotonic_output_order }, { AV2E_SET_FORCE_DEFERRED_FRAMES_FOR_RAS_TEST, ctrl_set_force_deferred_frames_for_ras_test }, + { AVME_SET_XLAYER_ID, ctrl_set_xlayer_id }, + { AV2E_SET_MLAYER_DEPENDENCY_PRESENT, ctrl_set_mlayer_dependency_present }, + { AV2E_SET_MLAYER_DEPENDENCY_MAP, ctrl_set_mlayer_dependency_map }, + { AV2E_SET_INTRA_ONLY_FWD_KF, ctrl_set_intra_only_fwd_kf }, + { AV2E_SET_MLAYER_COLOR_PRIMARIES, ctrl_set_mlayer_color_primaries }, + { AV2E_SET_MLAYER_TRANSFER_CHARACTERISTICS, + ctrl_set_mlayer_transfer_characteristics }, + { AV2E_SET_MLAYER_MATRIX_COEFFICIENTS, ctrl_set_mlayer_matrix_coefficients }, + { AV2E_SET_MLAYER_COLOR_RANGE, ctrl_set_mlayer_color_range }, // Getters { AVME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/av2/av2_dx_iface.c b/av2/av2_dx_iface.c index 7951f1c727..083d812e54 100644 --- a/av2/av2_dx_iface.c +++ 
b/av2/av2_dx_iface.c
@@ -1163,6 +1163,7 @@ static avm_image_t *add_grain_if_needed(avm_codec_alg_priv_t *ctx,
   grain_img->mlayer_id = img->mlayer_id;
   grain_img->xlayer_id = img->xlayer_id;
   grain_img->stream_id = img->stream_id;
+  grain_img->display_order_hint = img->display_order_hint;
   img->metadata = NULL;
   if (av2_add_film_grain(grain_params, img, grain_img)) {
     pool->release_fb_cb(pool->cb_priv, fb);
@@ -1247,6 +1248,7 @@ static avm_image_t *decoder_get_frame_(avm_codec_alg_priv_t *ctx,
           img->mlayer_id = output_frame_buf->mlayer_id;
           img->xlayer_id = output_frame_buf->xlayer_id;
           img->stream_id = output_frame_buf->stream_id;
+          img->display_order_hint = output_frame_buf->display_order_hint;
 
           if (pbi->skip_film_grain) grain_params->apply_grain = 0;
           avm_image_t *res =
@@ -2007,6 +2009,196 @@ static avm_codec_err_t ctrl_set_row_mt(avm_codec_alg_priv_t *ctx,
   return AVM_CODEC_OK;
 }
 
+// Fills one avm_xlayer_layer_info_t entry from an LCR xlayer-info record.
+// When the record carries embedded-layer info, the layer, auxiliary and view
+// types are taken from mlayer 0 (auxiliary_type is -1 unless layer type is 1);
+// otherwise num_mlayers/layer_type/view_type are 0 and auxiliary_type is -1.
+static void fill_xlayer_layer_info(avm_xlayer_layer_info_t *xl, int xlayer_id,
+                                   const struct LCRXLayerInfo *xli) {
+  xl->xlayer_id = xlayer_id;
+  xl->max_width = xli->rep_params.lcr_max_pic_width;
+  xl->max_height = xli->rep_params.lcr_max_pic_height;
+  if (xli->lcr_embedded_layer_info_present_flag) {
+    const struct EmbeddedLayerInfo *ml = &xli->mlayer_params;
+    xl->num_mlayers = ml->MLayerCount;
+    // Use mlayer 0 for layer_type/auxiliary_type/view_type
+    xl->layer_type = ml->lcr_layer_type[0];
+    xl->auxiliary_type =
+        (ml->lcr_layer_type[0] == 1) ? ml->lcr_auxiliary_type[0] : -1;
+    xl->view_type = ml->lcr_view_type[0];
+  } else {
+    xl->num_mlayers = 0;
+    xl->layer_type = 0;
+    xl->auxiliary_type = -1;
+    xl->view_type = 0;
+  }
+}
+
+// Control handler that reports the layer configuration known to the decoder.
+//
+// Takes one avm_lcr_info_t * argument, which is zeroed and then filled from
+// the first valid global LCR, or, when there is none, from the first valid
+// local LCR of each xlayer. info->num_xlayers never exceeds the number of
+// entries actually written.
+//
+// Returns AVM_CODEC_INVALID_PARAM for a NULL argument, AVM_CODEC_ERROR when
+// there is no frame worker or no LCR was found, AVM_CODEC_OK otherwise.
+static avm_codec_err_t ctrl_get_lcr_info(avm_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  // Upper bound on reported entries (xlayer ids below GLOBAL_XLAYER_ID).
+  // NOTE(review): assumed to match the capacity of info->xlayers — confirm
+  // against the avm_lcr_info_t declaration.
+  const int max_entries = 31;
+
+  avm_lcr_info_t *const info = va_arg(args, avm_lcr_info_t *);
+  if (!info) return AVM_CODEC_INVALID_PARAM;
+
+  memset(info, 0, sizeof(*info));
+
+  if (!ctx->frame_worker) return AVM_CODEC_ERROR;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  AV2Decoder *const pbi = frame_worker_data->pbi;
+
+  // Try global LCR first (stored at xlayer_id = GLOBAL_XLAYER_ID = 31)
+  const struct GlobalLayerConfigurationRecord *glcr = NULL;
+  for (int lcr_idx = 0; lcr_idx < MAX_NUM_LCR; lcr_idx++) {
+    const struct LayerConfigurationRecord *lcr =
+        &pbi->lcr_list[GLOBAL_XLAYER_ID][lcr_idx];
+    if (lcr->valid && lcr->is_global) {
+      glcr = &lcr->global_lcr;
+      break;
+    }
+  }
+
+  if (glcr) {
+    // Clamp the reported count to the number of entries that get filled, so
+    // callers iterating up to num_xlayers never read unfilled entries.
+    int num_entries = glcr->LcrMaxNumXLayerCount;
+    if (num_entries > max_entries) num_entries = max_entries;
+    info->num_xlayers = num_entries;
+    for (int i = 0; i < num_entries; i++) {
+      fill_xlayer_layer_info(&info->xlayers[i], glcr->LcrXLayerID[i],
+                             &glcr->xlayer_info[i]);
+    }
+    return AVM_CODEC_OK;
+  }
+
+  // Fallback: assemble from local LCRs per xlayer
+  int count = 0;
+  for (int xlid = 0; xlid < GLOBAL_XLAYER_ID && count < max_entries; xlid++) {
+    for (int lcr_idx = 0; lcr_idx < MAX_NUM_LCR; lcr_idx++) {
+      const struct LayerConfigurationRecord *lcr =
+          &pbi->lcr_list[xlid][lcr_idx];
+      if (lcr->valid && !lcr->is_global) {
+        fill_xlayer_layer_info(&info->xlayers[count], xlid,
+                               &lcr->local_lcr.xlayer_info);
+        count++;
+        break;  // found LCR for this xlayer, move to next
+      }
+    }
+  }
+  info->num_xlayers = count;
+
+  return (count > 0) ? AVM_CODEC_OK : AVM_CODEC_ERROR;
+}
+
+static avm_codec_err_t ctrl_get_atlas_info(avm_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  avm_atlas_info_t *const info = va_arg(args, avm_atlas_info_t *);
+  if (!info) return AVM_CODEC_INVALID_PARAM;
+
+  memset(info, 0, sizeof(*info));
+
+  if (!ctx->frame_worker) return AVM_CODEC_ERROR;
+
+  AVxWorker *const worker = ctx->frame_worker;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  AV2Decoder *const pbi = frame_worker_data->pbi;
+
+  // Scan atlas_list for the first valid entry
+  for (int xlid = 0; xlid < MAX_NUM_XLAYERS; xlid++) {
+    for (int seg_idx = 0; seg_idx < MAX_NUM_ATLAS_SEG_ID; seg_idx++) {
+      const struct AtlasSegmentInfo *asi = &pbi->atlas_list[xlid][seg_idx];
+      if (!asi->valid) continue;
+
+      if (asi->atlas_segment_mode_idc == ENHANCED_ATLAS) {
+        // Enhanced atlas: dimensions and segments from region info + mapping
+        const struct AtlasRegionInfo *reg = &asi->ats_reg_params;
+        const struct AtlasRegionToSegmentMapping *rsm = &asi->ats_reg_seg_map;
+        info->atlas_width = reg->AtlasWidth;
+        info->atlas_height = reg->AtlasHeight;
+        info->num_segments = rsm->ats_num_atlas_segments_minus_1 + 1;
+        // For enhanced atlas, derive segment positions from region mapping
+        for (int s = 0; s < info->num_segments && s < 256; s++) {
+          // Compute segment position from top-left region column/row
+          int col = rsm->ats_top_left_region_column[s];
+          int row = rsm->ats_top_left_region_row[s];
+          int seg_x = 0, seg_y = 0;
+          int seg_w = 0, seg_h = 0;
+          for (int c = 0; c < col; c++)
+            seg_x += reg->ats_column_width_minus_1[c] + 1;
+          for (int r = 0; r < row; r++)
+            seg_y += reg->ats_row_height_minus_1[r] + 1;
+          int br_col = rsm->ats_bottom_right_region_column[s];
+          int br_row = rsm->ats_bottom_right_region_row[s];
+          for (int c = col; c <= br_col; c++)
+            seg_w += reg->ats_column_width_minus_1[c] + 1;
+          for (int r = row; r <= br_row; r++)
+            seg_h += reg->ats_row_height_minus_1[r] + 1;
+          // Use label segment info for
xlayer_id mapping + const struct AtlasLabelSegmentInfo *lsi = &asi->ats_label_seg; + int seg_label_id = lsi->ats_signalled_atlas_segment_ids_flag + ? lsi->AtlasSegmentIndexToID[s] + : s; + info->segments[s].xlayer_id = seg_label_id; + info->segments[s].pos_x = seg_x; + info->segments[s].pos_y = seg_y; + info->segments[s].width = seg_w; + info->segments[s].height = seg_h; + } + return AVM_CODEC_OK; + } else if (asi->atlas_segment_mode_idc == BASIC_ATLAS || + asi->atlas_segment_mode_idc == MULTISTREAM_ATLAS || + asi->atlas_segment_mode_idc == MULTISTREAM_ALPHA_ATLAS) { + // Basic/multistream atlas: dimensions and segments from basic info + const struct AtlasBasicInfo *abi = asi->ats_basic_info; + if (!abi) abi = &asi->ats_basic_info_s; + if (!abi || (abi->AtlasWidth == 0 && abi->AtlasHeight == 0)) continue; + info->atlas_width = abi->AtlasWidth; + info->atlas_height = abi->AtlasHeight; + info->num_segments = abi->ats_num_atlas_segments_minus_1 + 1; + for (int s = 0; s < info->num_segments && s < 256; s++) { + info->segments[s].xlayer_id = + abi->ats_stream_id_present ? 
abi->ats_input_stream_id[s] : s; + info->segments[s].pos_x = abi->ats_segment_top_left_pos_x[s]; + info->segments[s].pos_y = abi->ats_segment_top_left_pos_y[s]; + info->segments[s].width = abi->ats_segment_width[s]; + info->segments[s].height = abi->ats_segment_height[s]; + } + return AVM_CODEC_OK; + } else if (asi->atlas_segment_mode_idc == SINGLE_ATLAS) { + // Single atlas: single segment, dimensions from nominal_width/height + info->atlas_width = asi->ats_nominal_width_minus1 + 1; + info->atlas_height = asi->ats_nominal_height_minus1 + 1; + info->num_segments = 1; + info->segments[0].xlayer_id = xlid; + info->segments[0].pos_x = 0; + info->segments[0].pos_y = 0; + info->segments[0].width = info->atlas_width; + info->segments[0].height = info->atlas_height; + return AVM_CODEC_OK; + } + } + } + + return AVM_CODEC_ERROR; +} + +static avm_codec_err_t ctrl_get_monotonic_output_order( + avm_codec_alg_priv_t *ctx, va_list args) { + unsigned int *const val = va_arg(args, unsigned int *); + if (!val) return AVM_CODEC_INVALID_PARAM; + if (!ctx->frame_worker) return AVM_CODEC_ERROR; + + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV2Decoder *const pbi = frame_worker_data->pbi; + *val = pbi->common.seq_params.monotonic_output_order_flag; + return AVM_CODEC_OK; +} + static avm_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV2_COPY_REFERENCE, ctrl_copy_reference }, @@ -2051,6 +2243,9 @@ static avm_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AVMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag }, { AVMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info }, { AVMD_GET_FRAME_INFO, ctrl_get_dec_frame_info }, + { AV2D_GET_LCR_INFO, ctrl_get_lcr_info }, + { AV2D_GET_ATLAS_INFO, ctrl_get_atlas_info }, + { AV2D_GET_MONOTONIC_OUTPUT_ORDER, ctrl_get_monotonic_output_order }, CTRL_MAP_END, }; diff --git a/av2/common/av2_common_int.h b/av2/common/av2_common_int.h index 83394d3efd..e00ea641ce 100644 
--- a/av2/common/av2_common_int.h +++ b/av2/common/av2_common_int.h @@ -3004,6 +3004,13 @@ typedef struct AV2Common { * Initialized to -1 (unset). */ int olk_refresh_frame_flags[MAX_NUM_MLAYERS]; + /*! + * Refresh frame flags of a hidden intra forward keyframe (intra_only_fwd_kf) + * per layer. Used by the encoder to protect the hidden intra's DPB slot + * from being overwritten by subsequent pyramid frames. Initialized to -1 + * (unset). Not used by the decoder. + */ + int fwd_intra_refresh_frame_flags[MAX_NUM_MLAYERS]; /*! * Accumulated refresh_frame_flags of regular VCL OBUs co-signalled with an * OLK in the same temporal unit, per mlayer. Initialized to -1 (unset). @@ -3342,6 +3349,19 @@ static INLINE int is_mlayer_transitively_dependent( return 0; }
+// True when mlayer `layer_id` has no dependency on any other mlayer, that is
+// mlayer_dependency_map[layer_id][j] is zero for every j other than layer_id.
+static INLINE int is_mlayer_independent(const SequenceHeader *const seq,
+                                        const int layer_id) {
+  int other = 0;
+  while (other <= seq->max_mlayer_id) {
+    if (other != layer_id && seq->mlayer_dependency_map[layer_id][other]) break;
+    ++other;
+  }
+  // The scan ran to completion only if no dependency was found.
+  return other > seq->max_mlayer_id;
+}
+
 static INLINE void get_secondary_reference_frame_idx(const AV2_COMMON *const cm, int *ref_frame_used, int *secondary_map_idx) { diff --git a/av2/decoder/decodeframe.c b/av2/decoder/decodeframe.c index 404d59fb6e..01306f51c2 100644 --- a/av2/decoder/decodeframe.c +++ b/av2/decoder/decodeframe.c @@ -7446,15 +7446,16 @@ static void activate_layer_configuration_record(AV2Decoder *pbi, // so that embedded layer info can fall back to it.
if (!lcr->is_global) { int global_id = lcr->local_lcr.lcr_global_id; - if (global_id != LCR_ID_UNSPECIFIED) { - LayerConfigurationRecord *parent_glcr = - &pbi->lcr_list[GLOBAL_XLAYER_ID][global_id]; - if (parent_glcr->valid && parent_glcr->is_global) { - cm->global_lcr_params = *parent_glcr; - // Conformance: when a local LCR is present and its parent global LCR - // has xlayer_info for the same extended layer, the local LCR's - // xlayer_info shall be the same as the global LCR's xlayer_info. - const GlobalLayerConfigurationRecord *glcr = &parent_glcr->global_lcr; + LayerConfigurationRecord *parent_glcr = + &pbi->lcr_list[GLOBAL_XLAYER_ID][global_id]; + if (parent_glcr->valid && parent_glcr->is_global) { + cm->global_lcr_params = *parent_glcr; + // Conformance: when a local LCR is present and its parent global LCR + // has xlayer_info for the same extended layer, the local LCR's + // xlayer_info shall be the same as the global LCR's xlayer_info. + // This check only applies when the global LCR carries payload data. 
+ const GlobalLayerConfigurationRecord *glcr = &parent_glcr->global_lcr; + if (glcr->lcr_global_payload_present_flag) { for (int i = 0; i < glcr->LcrMaxNumXLayerCount; i++) { if (glcr->LcrXLayerID[i] == lcr->xlayer_id) { if (memcmp(&lcr->local_lcr.xlayer_info, &glcr->xlayer_info[i], diff --git a/av2/decoder/obu.c b/av2/decoder/obu.c index 66ed285e97..95df5a297c 100644 --- a/av2/decoder/obu.c +++ b/av2/decoder/obu.c @@ -2687,6 +2687,7 @@ int avm_decode_frame_from_obus(struct AV2Decoder *pbi, const uint8_t *data, pbi->seen_frame_header = 0; pbi->next_start_tile = 0; pbi->seen_vcl_obu_in_this_tu = 0; + pbi->this_is_first_vcl_obu_in_tu = 0; pbi->doh_tu_order_hint_bits_set = 0; for (int i = 0; i < NUM_CUSTOM_QMS; i++) pbi->qm_protected[i] = 0; @@ -2700,6 +2701,7 @@ int avm_decode_frame_from_obus(struct AV2Decoder *pbi, const uint8_t *data, pbi->seen_frame_header = 0; pbi->next_start_tile = 0; pbi->seen_vcl_obu_in_this_tu = 0; + pbi->this_is_first_vcl_obu_in_tu = 0; pbi->doh_tu_order_hint_bits_set = 0; for (int i = 0; i < NUM_CUSTOM_QMS; i++) pbi->qm_protected[i] = 0; } diff --git a/av2/encoder/bitstream.c b/av2/encoder/bitstream.c index 558049d3df..f5f94ea752 100644 --- a/av2/encoder/bitstream.c +++ b/av2/encoder/bitstream.c @@ -5368,7 +5368,8 @@ static AVM_INLINE void write_uncompressed_header( } } - if (obu_type == OBU_OPEN_LOOP_KEY) { + if (obu_type == OBU_OPEN_LOOP_KEY || av2_is_olk_forward_keyframe(cpi)) { + // OLK (non-monotonic open GOP): set OLK state. cpi->olk_encountered = 1; cm->last_olk_disp_order_hint = cm->current_frame.display_order_hint; cm->last_olk_order_hint = cm->current_frame.order_hint; @@ -5377,6 +5378,13 @@ static AVM_INLINE void write_uncompressed_header( // In this encoder, the OLK updates only one reference slot cm->olk_refresh_frame_flags[cm->mlayer_id] = current_frame->refresh_frame_flags; + } else if (av2_is_fwd_intra_keyframe(cpi)) { + // Hidden intra forward keyframe (monotonic open GOP): separate state from + // OLK. 
Protects the hidden intra's DPB slot but does not use OLK/leading + // frame machinery. + cpi->fwd_intra_encountered = 1; + cm->fwd_intra_refresh_frame_flags[cm->mlayer_id] = + current_frame->refresh_frame_flags; } else if (obu_type == OBU_CLOSED_LOOP_KEY || (cm->is_leading_picture == 0 && cpi->gf_group.update_type[cpi->gf_group.index] != @@ -5390,7 +5398,8 @@ static AVM_INLINE void write_uncompressed_header( cm->prev_olk_co_vcl_refresh_frame_flags[cm->mlayer_id] = INVALID_IDX; } else if (cpi->olk_encountered && cm->current_frame.display_order_hint >= cm->last_olk_disp_order_hint) { - // This is a frame within the same TU as the OLK. Cannot refresh it either. + // This is a co-VCL frame within the same TU as the OLK (non-monotonic + // only). Accumulate its refresh flags so the OLK slot set is complete. cm->olk_refresh_frame_flags[cm->mlayer_id] |= current_frame->refresh_frame_flags; } @@ -6941,6 +6950,83 @@ size_t av2_write_metadata_user_data_unregistered(AV2_COMP *const cpi, return total_bytes_written; } +// Compare two ContentInterpretation color/chroma fields for equality. +// Used to decide whether a per-mlayer CI OBU is needed or if inheritance +// from a dependent layer suffices. 
+static int ci_params_equal(const ContentInterpretation *a,
+                           const ContentInterpretation *b) {
+  // All four presence flags have to agree before any payload is examined.
+  // For aspect ratio and timing info only the flags themselves are compared.
+  if (a->ci_color_description_present_flag !=
+          b->ci_color_description_present_flag ||
+      a->ci_chroma_sample_position_present_flag !=
+          b->ci_chroma_sample_position_present_flag ||
+      a->ci_aspect_ratio_info_present_flag !=
+          b->ci_aspect_ratio_info_present_flag ||
+      a->ci_timing_info_present_flag != b->ci_timing_info_present_flag) {
+    return 0;
+  }
+
+  // Colour description payload, compared only when signalled.
+  if (a->ci_color_description_present_flag &&
+      (a->color_info.color_description_idc !=
+           b->color_info.color_description_idc ||
+       a->color_info.color_primaries != b->color_info.color_primaries ||
+       a->color_info.transfer_characteristics !=
+           b->color_info.transfer_characteristics ||
+       a->color_info.matrix_coefficients !=
+           b->color_info.matrix_coefficients ||
+       a->color_info.full_range_flag != b->color_info.full_range_flag)) {
+    return 0;
+  }
+
+  // Chroma sample positions (two entries), compared only when signalled.
+  if (a->ci_chroma_sample_position_present_flag &&
+      (a->ci_chroma_sample_position[0] != b->ci_chroma_sample_position[0] ||
+       a->ci_chroma_sample_position[1] != b->ci_chroma_sample_position[1])) {
+    return 0;
+  }
+  return 1;
+}
+
+// Write a CI OBU for the current mlayer if it has distinct CI.
+// Returns the number of bytes written (0 if skipped). Sets *err on failure.
+static size_t write_ci_obu_for_mlayer(AV2_COMP *const cpi, uint8_t *data,
+                                      avm_codec_err_t *err) {
+  AV2_COMMON *const cm = &cpi->common;
+  const int ml = cm->mlayer_id;
+  ContentInterpretation *const ci = &cm->ci_params_per_layer[ml];
+  *err = AVM_CODEC_OK;
+
+  // Nothing to emit when CI signalling is switched off.
+  if (!cpi->write_ci_obu_flag) return 0;
+
+  // A higher mlayer inherits CI from the decoder side, so the OBU is skipped
+  // when its parameters equal those of its lowest-numbered dependency. Only
+  // that first dependency is examined.
+  for (int ref = 0; ref < ml; ref++) {
+    if (!cm->seq_params.mlayer_dependency_map[ml][ref]) continue;
+    if (ci_params_equal(ci, &cm->ci_params_per_layer[ref])) return 0;
+    break;
+  }
+
+  // obu_layer byte: mlayer in the upper 3 bits, xlayer in the lower 5.
+  const int obu_layer_ci = (ml << 5) | cm->xlayer_id;
+  const uint32_t header_bytes =
+      av2_write_obu_header(OBU_CONTENT_INTERPRETATION, 0, obu_layer_ci, data);
+  const uint32_t payload_bytes =
+      av2_write_content_interpretation_obu(ci, data + header_bytes);
+  // Size of the length field, as reported by obu_memmove().
+  const size_t length_field_bytes =
+      obu_memmove(header_bytes, payload_bytes, data);
+  if (av2_write_uleb_obu_size(header_bytes, payload_bytes, data) !=
+      AVM_CODEC_OK) {
+    *err = AVM_CODEC_ERROR;
+    return 0;
+  }
+  return header_bytes + payload_bytes + length_field_bytes;
+}
+
 // This function actually writes to the bistream. The av2_pack_bitstream() // function is a thin wrapper around this function. static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, @@ -6958,6 +7044,19 @@ static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, obu_mlayer << 5 | obu_xlayer; // obu_layer byte (mlayer (3-bit) | xlayer (5-bit)) + // Track which higher mlayers need CI OBUs at this RAP. ci_rap_tu is used + // as a bitmask: bit i is set when mlayer i still needs its CI OBU written. + // Only set at CLK/OLK for mlayer 0; each higher mlayer clears its bit once + // its CI OBU is emitted. This survives the intervening mlayer 0 non-CLK + // frames that are encoded before mlayer 1+ starts (due to lag encoding).
+ if (cm->mlayer_id == 0 && + (cm->current_frame.cm_obu_type == OBU_CLOSED_LOOP_KEY || + cm->current_frame.cm_obu_type == OBU_OPEN_LOOP_KEY)) { + // Set bits for mlayers 1..max_mlayer_id (mlayer 0 is handled inline) + const int max_ml = cm->seq_params.max_mlayer_id; + cpi->ci_rap_tu = max_ml > 0 ? (((1 << (max_ml + 1)) - 1) & ~1) : 0; + } + bool add_new_user_qm = false; // If no non-zero delta_q has been used, reset delta_q_present_flag if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { @@ -7056,19 +7155,11 @@ static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, if (cm->current_frame.cm_obu_type == OBU_CLOSED_LOOP_KEY) { size_t length_field_size; - if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf && - cpi->write_ci_obu_flag) { - obu_header_size = - av2_write_obu_header(OBU_CONTENT_INTERPRETATION, 0, 0, data); - obu_payload_size = av2_write_content_interpretation_obu( - &cm->ci_params_encoder, data + obu_header_size); - size_t length_field_size1 = - obu_memmove(obu_header_size, obu_payload_size, data); - if (av2_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != - AVM_CODEC_OK) { - return AVM_CODEC_ERROR; - } - data += obu_header_size + obu_payload_size + length_field_size1; + if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { + avm_codec_err_t ci_err; + size_t ci_bytes = write_ci_obu_for_mlayer(cpi, data, &ci_err); + if (ci_err != AVM_CODEC_OK) return AVM_CODEC_ERROR; + data += ci_bytes; } if (cm->cur_mfh_id != 0) { @@ -7133,6 +7224,29 @@ static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, } } } + + // Write CI OBU at OLK (random access point) for conformance. + // The SH is not written at OLK (noted above), but CI must be present at + // all RAPs so decoders can recover color interpretation after random access. 
+ if (cm->current_frame.cm_obu_type == OBU_OPEN_LOOP_KEY) { + avm_codec_err_t ci_err; + size_t ci_bytes = write_ci_obu_for_mlayer(cpi, data, &ci_err); + if (ci_err != AVM_CODEC_OK) return AVM_CODEC_ERROR; + data += ci_bytes; + } + + // Write CI OBU for mlayer > 0 when in a RAP TU. + // Higher mlayers are encoded as REGULAR_TILE_GROUP (not CLK/OLK), so they + // don't enter the blocks above. ci_rap_tu bitmask was set by mlayer 0's + // CLK/OLK; each bit is cleared once that mlayer's CI has been emitted. + if (cm->mlayer_id > 0 && (cpi->ci_rap_tu & (1 << cm->mlayer_id))) { + avm_codec_err_t ci_err; + size_t ci_bytes = write_ci_obu_for_mlayer(cpi, data, &ci_err); + if (ci_err != AVM_CODEC_OK) return AVM_CODEC_ERROR; + data += ci_bytes; + cpi->ci_rap_tu &= ~(1 << cm->mlayer_id); + } + if (add_new_user_qm && !cpi->obu_is_written) { assert(cpi->total_signalled_qmobu_count > 0); obu_header_size = av2_write_obu_header(OBU_QUANTIZATION_MATRIX, @@ -7154,7 +7268,7 @@ static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, struct film_grain_model fgm_current; set_film_grain_model(cpi, &fgm_current); int use_existing_fgm = -1; - if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { + if (cm->current_frame.frame_type == KEY_FRAME && !cpi->is_fwd_kf) { cpi->written_fgm_num = 0; // clear the list, it is increased before uncompressed_header() fgm_current.fgm_id = 0; @@ -7313,7 +7427,7 @@ static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, OBU_TYPE obu_type = cm->is_leading_picture == 1 ? OBU_LEADING_TILE_GROUP : OBU_REGULAR_TILE_GROUP; if (cm->current_frame.frame_type == KEY_FRAME) - obu_type = cpi->no_show_fwd_kf ? OBU_OPEN_LOOP_KEY : OBU_CLOSED_LOOP_KEY; + obu_type = cpi->is_fwd_kf ? OBU_OPEN_LOOP_KEY : OBU_CLOSED_LOOP_KEY; if (cm->current_frame.frame_type == S_FRAME) obu_type = (cpi->is_ras_frame == 1) ? 
OBU_RAS_FRAME : OBU_SWITCH; @@ -7418,8 +7532,10 @@ static int av2_pack_bitstream_internal(AV2_COMP *const cpi, uint8_t *dst, int write_temporal_point_metadata = (cpi->write_ci_obu_flag && - cpi->common.ci_params_encoder.ci_timing_info_present_flag && - cpi->common.ci_params_encoder.timing_info.equal_elemental_interval == 0) + cpi->common.ci_params_per_layer[cpi->common.mlayer_id] + .ci_timing_info_present_flag && + cpi->common.ci_params_per_layer[cpi->common.mlayer_id] + .timing_info.equal_elemental_interval == 0) ? 1 : 0; if (write_temporal_point_metadata) { diff --git a/av2/encoder/encode_strategy.c b/av2/encoder/encode_strategy.c index fa786e6a87..2db7b6f4ce 100644 --- a/av2/encoder/encode_strategy.c +++ b/av2/encoder/encode_strategy.c @@ -178,45 +178,66 @@ static INLINE void update_gf_group_index(AV2_COMP *cpi) { cpi->common.number_mlayers == 1) { ++cpi->gf_group.index; } else { - // To be updated based on the (multi_layers) tests for nonzero lag. - // The current test is for fixed GOP with keyframe_filtering off. + // Multi-mlayer with lag: within each temporal unit, complete all frames + // for the current embedded layer before switching to the next. + // + // ARFs are "hidden" (batched with their overlay) when monotonic mode is + // on OR when add_sef_for_hidden_frames is on. In both cases, the hidden + // frames are grouped together with the first displayable frame in a + // single TU, and the index rewinds so each mlayer processes the same + // hidden batch. + // + // In non-monotonic mode WITHOUT SEF, ARF and INTNL_ARF frames are + // implicit output (the decoder reorders them), so each gets its own TU — + // no batching. Overlay entries still execute (popping from the lookahead + // to stay in sync) but become zero-byte FRAME_NULL_PKT via the + // forced_implicit path in av2_cx_iface.c. 
GF_GROUP *const gf_group = &cpi->gf_group; - if (gf_group->update_type[cpi->gf_group.index] == ARF_UPDATE || - gf_group->update_type[cpi->gf_group.index] == INTNL_ARF_UPDATE || - gf_group->update_type[cpi->gf_group.index] == KFFLT_UPDATE) { + const FRAME_UPDATE_TYPE cur_type = gf_group->update_type[gf_group->index]; + const int nonmono = !cpi->common.seq_params.monotonic_output_order_flag; + // ARFs are hidden (not implicit output) when either monotonic mode is on + // OR add_sef_for_hidden_frames is on (SEF mode forces hidden+SEF output). + const int arfs_are_hidden = + !nonmono || cpi->oxcf.ref_frm_cfg.add_sef_for_hidden_frames; + + if (arfs_are_hidden && + (cur_type == ARF_UPDATE || cur_type == INTNL_ARF_UPDATE || + cur_type == KFFLT_UPDATE)) { + // Hidden frame — advance index, stay on same mlayer. ++gf_group->index; - // Continue on the same mlayer. - if (cpi->common.mlayer_id == 0) gf_group->arf_update_counter++; - } else if (cpi->common.mlayer_id == 0 && cpi->gf_group.index > 0 && - (gf_group->update_type[cpi->gf_group.index] == LF_UPDATE || - gf_group->update_type[cpi->gf_group.index] == - FWD_KF_OVERLAY_UPDATE || - gf_group->update_type[cpi->gf_group.index] == - FWD_KF_SUCCESSOR_UPDATE) && - (gf_group->update_type[cpi->gf_group.index - 1] == ARF_UPDATE || - gf_group->update_type[cpi->gf_group.index - 1] == - INTNL_ARF_UPDATE || - gf_group->update_type[cpi->gf_group.index - 1] == - OVERLAY_UPDATE || - gf_group->update_type[cpi->gf_group.index - 1] == - INTNL_OVERLAY_UPDATE || - gf_group->update_type[cpi->gf_group.index - 1] == - KFFLT_OVERLAY_UPDATE)) { - // This willl force the next encode_call to encode ARFs followed by LF - // at the next ml layer. - gf_group->index = gf_group->index - gf_group->arf_update_counter; + gf_group->arf_update_counter++; + } else if (nonmono && cur_type == KFFLT_UPDATE) { + // Non-monotonic KFFLT without SEF: the filtered keyframe is hidden + // (not implicit output like ARF/INTNL_ARF). 
All frames of a given + // embedded layer must be grouped together up to the output frame before + // moving to the next layer. Advance to the KFFLT_OVERLAY (displayable) + // while staying on the same mlayer, and track the hidden frame count so + // the rewind logic below replays for the next mlayer. + ++gf_group->index; + gf_group->arf_update_counter++; + } else if ((arfs_are_hidden || gf_group->arf_update_counter > 0) && + gf_group->arf_update_counter > 0 && + (unsigned int)cpi->common.mlayer_id < + cpi->common.number_mlayers - 1) { + // End of hidden batch + displayable for current mlayer, more mlayers + // remain: rewind to start of hidden batch and advance mlayer. + gf_group->index -= gf_group->arf_update_counter; gf_group->arf_update_counter = 0; - // Go to next mlayer - cpi->common.next_mlayer_id = 1; - } else if ((unsigned int)cpi->common.mlayer_id == + cpi->common.next_mlayer_id = cpi->common.mlayer_id + 1; + } else if ((unsigned int)cpi->common.mlayer_id < cpi->common.number_mlayers - 1) { - // Every regular frame is encoded with same source up to number_mlayers. + // Not last mlayer: stay at same index, switch to next mlayer. + cpi->common.next_mlayer_id = cpi->common.mlayer_id + 1; + } else { + // Last mlayer: advance index, switch back to ml=0. ++gf_group->index; - // Go back to mlayer 0 + gf_group->arf_update_counter = 0; cpi->common.next_mlayer_id = 0; - } else { - // Go to next mlayer - cpi->common.next_mlayer_id = 1; + + // Non-monotonic: overlay entries are NOT skipped. They pop from + // the lookahead (keeping it in sync) and become zero-byte + // FRAME_NULL_PKT via the forced_implicit path when the underlying + // ARF/INTNL was implicit output. 
} } } @@ -756,9 +777,9 @@ int av2_get_refresh_frame_flags( AV2_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { - // Shown key-frames overwrite all reference slots + // Shown key-frames overwrite all reference slots (CLK only, not OLK) if (av2_is_shown_keyframe(cpi, frame_params->frame_type) && - cpi->common.seq_params.max_mlayer_id == 0 && !cpi->no_show_fwd_kf) { + cpi->common.seq_params.max_mlayer_id == 0 && !cpi->is_fwd_kf) { return (1 << cpi->common.seq_params.ref_frames) - 1; } @@ -784,23 +805,30 @@ int av2_get_refresh_frame_flags( } } } - // For fwd kf, only refresh one buffer. The other buffers will be refreshed - // on the first regular TU it encounters after the OLK TU. - if (cpi->no_show_fwd_kf) { - int refresh_idx = -1; - for (int i = 0; i < cm->seq_params.ref_frames; ++i) { - if ((refresh_frame_flags >> i) & 1) { - // Skip slots containing implicit-output frames that have not - // been output yet and whose DOH is at least the current - // frame's DOH. (DOH requirement) - if (cm->ref_frame_map[i] != NULL && - cm->ref_frame_map[i]->implicit_output_picture && - !cm->ref_frame_map[i]->frame_output_done && - (int)cm->ref_frame_map[i]->display_order_hint >= cur_disp_order) { - continue; + // For fwd kf (displayed or hidden OLK), only refresh one buffer. + // The other buffers will be refreshed on the first regular TU it + // encounters after the OLK TU. + if (cpi->is_fwd_kf) { + // With multiple embedded layers, each mlayer's forward keyframe must + // refresh a different DPB slot. Use the mlayer-aware slot allocator and + // protect slots already claimed by earlier mlayers. + int fwd_kf_flags_to_keep = 0; + const int is_fwd_intra = cpi->oxcf.kf_cfg.intra_only_fwd_kf; + for (int ml = 0; ml < cm->mlayer_id; ml++) { + int flags = is_fwd_intra ? 
cm->fwd_intra_refresh_frame_flags[ml] + : cm->olk_refresh_frame_flags[ml]; + if (flags != -1) fwd_kf_flags_to_keep |= flags; + } + int refresh_idx = get_free_ref_map_index_multi_layer( + ref_frame_map_pairs, cm->seq_params.ref_frames, fwd_kf_flags_to_keep, + cm->mlayer_id); + if (refresh_idx == INVALID_IDX) { + // Fallback: pick the first available bit in refresh_frame_flags + for (int i = 0; i < cm->seq_params.ref_frames; ++i) { + if ((refresh_frame_flags >> i) & 1) { + refresh_idx = i; + break; } - refresh_idx = i; - break; } } assert(refresh_idx >= 0); @@ -842,6 +870,14 @@ int av2_get_refresh_frame_flags( olk_flags_to_keep |= cpi->common.olk_refresh_frame_flags[layer]; } } + // Also protect the hidden intra forward keyframe's DPB slot. + if (cpi->fwd_intra_encountered) { + for (int layer = 0; layer <= cpi->common.seq_params.max_mlayer_id; + layer++) { + if (cpi->common.fwd_intra_refresh_frame_flags[layer] == -1) continue; + olk_flags_to_keep |= cpi->common.fwd_intra_refresh_frame_flags[layer]; + } + } // Protect ref buffer slots containing implicit-output frames with DOH // at least the current frame's DOH. (DOH requirement) @@ -970,8 +1006,8 @@ static int denoise_and_encode(AV2_COMP *const cpi, uint8_t *const dest, av2_frame_init_quantizer(cpi); av2_setup_past_independence(cm); - if (!frame_params->immediate_output_picture && cpi->no_show_fwd_kf) { - // fwd kf + if (!frame_params->immediate_output_picture && cpi->is_fwd_kf) { + // fwd kf (displayed or hidden OLK) arf_src_index = -1 * gf_group->arf_src_offset[gf_group->index]; } else if (!frame_params->immediate_output_picture) { arf_src_index = 0; @@ -1038,14 +1074,26 @@ static int denoise_and_encode(AV2_COMP *const cpi, uint8_t *const dest, cm->cur_frame->allow_direct_use = cm->allow_direct_use; + // Non-monotonic multi-mlayer: track the ARF's allow_direct_use decision. 
+ // This persists across mlayers so the OVERLAY_UPDATE can produce null + // output even when the DPB reference has been evicted for later mlayers. + // Only applies when ARFs are truly implicit output (not SEF mode). + if (cpi->oxcf.unit_test_cfg.multi_layers_lag_test && cm->number_mlayers > 1 && + !cm->seq_params.monotonic_output_order_flag && + !cpi->oxcf.ref_frm_cfg.add_sef_for_hidden_frames && + get_frame_update_type(&cpi->gf_group) == ARF_UPDATE) { + cpi->gf_group.arf_is_implicit_output = cm->allow_direct_use; + } + // perform tpl after filtering int allow_tpl = oxcf->gf_cfg.lag_in_frames > 1 && !is_stat_generation_stage(cpi) && oxcf->algo_cfg.enable_tpl_model; - if (frame_params->frame_type == KEY_FRAME) { - // Don't do tpl for fwd key frames + if (frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME) { + // Don't do tpl for fwd key frames or intra-only fwd frames allow_tpl = allow_tpl && !cpi->sf.tpl_sf.disable_filtered_key_tpl && - !cpi->no_show_fwd_kf; + !cpi->is_fwd_kf; } else { // Do tpl after ARF is filtered, or if no ARF, at the second frame of GF // group. @@ -1142,11 +1190,12 @@ int av2_encode_strategy(AV2_COMP *const cpi, size_t *const size, if (cpi->oxcf.ref_frm_cfg.add_sef_for_hidden_frames) { cpi->common.implicit_output_picture = 0; } - if (gf_group->update_type[gf_group->index] == FWD_KF_OVERLAY_UPDATE || - gf_group->update_type[gf_group->index] == FWD_KF_SUCCESSOR_UPDATE) { + if ((gf_group->update_type[gf_group->index] == FWD_KF_OVERLAY_UPDATE || + gf_group->update_type[gf_group->index] == FWD_KF_SUCCESSOR_UPDATE) && + !cpi->oxcf.kf_cfg.intra_only_fwd_kf) { // These have to use implicit output since they need to be // coded_output_picture OBUs, to be put together with a hidden OLK obu in - // the same TU. + // the same TU. Not applicable for intra_only_fwd_kf. 
cpi->common.implicit_output_picture = 1; } @@ -1253,22 +1302,27 @@ int av2_encode_strategy(AV2_COMP *const cpi, size_t *const size, if (cpi->oxcf.ref_frm_cfg.add_sef_for_hidden_frames) { cm->implicit_output_picture = 0; } - if (gf_group->update_type[gf_group->index] == FWD_KF_OVERLAY_UPDATE || - gf_group->update_type[gf_group->index] == FWD_KF_SUCCESSOR_UPDATE) { + if ((gf_group->update_type[gf_group->index] == FWD_KF_OVERLAY_UPDATE || + gf_group->update_type[gf_group->index] == FWD_KF_SUCCESSOR_UPDATE) && + !cpi->oxcf.kf_cfg.intra_only_fwd_kf) { // These have to use implicit output since they need to be // coded_output_picture OBUs, to be put together with a hidden OLK obu in - // the same TU. + // the same TU. Not applicable for intra_only_fwd_kf which uses regular + // SEF instead of OLK TU structure. cpi->common.implicit_output_picture = 1; } - if (frame_params.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { + if (frame_params.frame_type == KEY_FRAME && !cpi->is_fwd_kf) { + // CLK: not implicit output, not direct use. cm->allow_direct_use = 0; cm->implicit_output_picture = 0; } - if (cpi->no_show_fwd_kf && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1) { + if (cpi->no_show_fwd_kf && (cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 || + cpi->oxcf.kf_cfg.intra_only_fwd_kf)) { // An overlay of the fwd kf is going to be added. The fwd kf cannot be - // directly displayed. + // directly displayed. For intra_only_fwd_kf, the hidden INTRA_ONLY_FRAME + // must always be shown via SEF, never directly. 
cm->allow_direct_use = 0; cm->implicit_output_picture = 0; } @@ -1297,7 +1351,9 @@ int av2_encode_strategy(AV2_COMP *const cpi, size_t *const size, cm->current_frame.display_order_hint_restricted = cur_frame_disp; cm->current_frame.pyramid_level = get_true_pyr_level( cpi->gf_group.layer_depth[cpi->gf_group.index], - cm->current_frame.frame_type == KEY_FRAME, cpi->gf_group.max_layer_depth, + frame_params.frame_type == KEY_FRAME || + frame_params.frame_type == INTRA_ONLY_FRAME, + cpi->gf_group.max_layer_depth, cpi->gf_group.update_type[cpi->gf_group.index] == KFFLT_OVERLAY_UPDATE); cm->current_frame.tlayer_id = cm->tlayer_id; @@ -1349,9 +1405,11 @@ int av2_encode_strategy(AV2_COMP *const cpi, size_t *const size, // encoded cpi->gf_state.olk_overlay_last = 1; } - int use_olk_ref_only = - cpi->gf_group.update_type[cpi->gf_group.index] == FWD_KF_OVERLAY_UPDATE || - cpi->gf_group.update_type[cpi->gf_group.index] == FWD_KF_SUCCESSOR_UPDATE; + int use_olk_ref_only = !cpi->oxcf.kf_cfg.intra_only_fwd_kf && + (cpi->gf_group.update_type[cpi->gf_group.index] == + FWD_KF_OVERLAY_UPDATE || + cpi->gf_group.update_type[cpi->gf_group.index] == + FWD_KF_SUCCESSOR_UPDATE); init_ref_map_pair(&cpi->common, cm->ref_frame_map_pairs, frame_params.frame_type == KEY_FRAME, cpi->is_ras_frame == 1, use_olk_ref_only); @@ -1507,7 +1565,7 @@ int av2_encode_strategy(AV2_COMP *const cpi, size_t *const size, // If this is a forward keyframe, mark as a show_existing_frame // TODO(bohanli): find a consistent condition for fwd keyframes - if (oxcf->kf_cfg.fwd_kf_enabled && + if (oxcf->kf_cfg.fwd_kf_enabled && !oxcf->kf_cfg.intra_only_fwd_kf && (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE || gf_group->update_type[gf_group->index] == KFFLT_OVERLAY_UPDATE) && gf_group->arf_index >= 0 && cpi->rc.frames_to_key == 0) { @@ -1529,6 +1587,10 @@ int av2_encode_strategy(AV2_COMP *const cpi, size_t *const size, KFFLT_OVERLAY_UPDATE)) || gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; } 
+ // In non-monotonic multi-mlayer mode, overlays for implicit-output + // frames remain as show_existing_frame. The forced_implicit path in + // av2_cx_iface.c converts them to zero-byte FRAME_NULL_PKT, keeping + // the lookahead in sync without emitting redundant OBUs. frame_params.frame_params_update_type_was_overlay &= allow_show_existing(cpi, *frame_flags); } else { diff --git a/av2/encoder/encoder.c b/av2/encoder/encoder.c index b38b0b0ec8..92ef97a1d9 100644 --- a/av2/encoder/encoder.c +++ b/av2/encoder/encoder.c @@ -903,6 +903,15 @@ static void set_content_interpreation_params(struct AV2_COMP *cpi, cpi->write_ci_obu_flag = 1; else cpi->write_ci_obu_flag = 0; + + // Propagate the base CI to all mlayer slots that haven't been explicitly + // overridden by AV2E_SET_MLAYER_COLOR_* controls. Use MAX_NUM_MLAYERS + // rather than max_mlayer_id because max_mlayer_id may not be set yet at + // init time (it is set later by av2_init_seq_coding_tools). + for (int ml = 0; ml < MAX_NUM_MLAYERS; ml++) { + if (!cpi->ci_per_layer_overridden[ml]) + cpi->common.ci_params_per_layer[ml] = *ci_params; + } } static void init_config(struct AV2_COMP *cpi, AV2EncoderConfig *oxcf) { @@ -4608,6 +4617,42 @@ static int encode_frame_to_data_rate(AV2_COMP *cpi, size_t *size, uint8_t *dest, cm->ref_frame_map[cpi->fb_idx_for_overlay] && cm->ref_frame_map[cpi->fb_idx_for_overlay]->implicit_output_picture; + // Non-monotonic multi-mlayer: when an overlay's underlying frame was + // implicit output but the reference has been evicted from the DPB (e.g., + // ml>0 in interleaved coding order), the overlay produces zero bytes. + // The source was already popped from the lookahead; we just return early. 
+ // This handles both: + // - INTNL_OVERLAY_UPDATE (always null, since INTNL always implicit output) + // - OVERLAY_UPDATE when arf_is_implicit_output (ARF had allow_direct_use=1) + if (!forced_implicit && cpi->update_type_was_overlay && + cpi->fb_idx_for_overlay == INVALID_IDX && + !cm->seq_params.monotonic_output_order_flag && + cpi->oxcf.unit_test_cfg.multi_layers_lag_test && cm->number_mlayers > 1) { + *size = 0; + if (cm->immediate_output_picture) cpi->last_show_frame_buf = cm->cur_frame; + if (!av2_check_keyframe_overlay(cpi->gf_group.index, &cpi->gf_group, + cpi->rc.frames_since_key)) + ++current_frame->frame_number; + return AVM_CODEC_OK; + } + // Also handle OVERLAY_UPDATE with valid fb_idx but where the ARF was + // tracked as implicit output (arf_is_implicit_output persists across + // mlayers even when the DPB slot has been reused). + if (!forced_implicit && !cpi->update_type_was_overlay && + cpi->gf_group.update_type[cpi->gf_group.index] == OVERLAY_UPDATE && + cpi->gf_group.arf_is_implicit_output && + !cm->seq_params.monotonic_output_order_flag && + cpi->oxcf.unit_test_cfg.multi_layers_lag_test && cm->number_mlayers > 1) { + *size = 0; + cpi->update_type_was_overlay = 1; // signal av2_cx_iface as null overlay + cpi->gf_group.arf_is_implicit_output = 0; // consumed + if (cm->immediate_output_picture) cpi->last_show_frame_buf = cm->cur_frame; + if (!av2_check_keyframe_overlay(cpi->gf_group.index, &cpi->gf_group, + cpi->rc.frames_since_key)) + ++current_frame->frame_number; + return AVM_CODEC_OK; + } + if ((!cpi->oxcf.ref_frm_cfg.add_sef_for_hidden_frames || forced_implicit) && cpi->update_type_was_overlay && cpi->fb_idx_for_overlay != INVALID_IDX && cm->ref_frame_map[cpi->fb_idx_for_overlay]) { @@ -4661,28 +4706,38 @@ static int encode_frame_to_data_rate(AV2_COMP *cpi, size_t *size, uint8_t *dest, } if (need_sef_obu_for_hidden_frame(cpi)) { - // If this is an olk SEF, reset the buffers except for the olk. 
- int ref_flags_to_keep = 0; - for (int layer = 0; layer <= seq_params->max_mlayer_id; layer++) { - ref_flags_to_keep |= cm->olk_refresh_frame_flags[layer]; - } - if (cpi->olk_encountered && ref_flags_to_keep != 0 && - cpi->fb_idx_for_overlay >= 0 && - ((ref_flags_to_keep >> cpi->fb_idx_for_overlay) & 1u)) { - for (int ref_index = 0; ref_index < cm->seq_params.ref_frames; - ref_index++) { - if (!((ref_flags_to_keep >> ref_index) & 1u) && - (cm->ref_frame_map[ref_index] == NULL || - cm->ref_frame_map[ref_index]->long_term_id == -1)) { - if (cm->ref_frame_map[ref_index] != NULL) { - --cm->ref_frame_map[ref_index]->ref_count; - cm->ref_frame_map[ref_index] = NULL; + if (cpi->fwd_intra_encountered) { + // Monotonic hidden intra forward keyframe: output via SEF. + // No DPB clearing — pyramid frames are regular and still referenced. + cpi->gf_state.fwd_intra_overlay_last = 1; + cpi->fwd_intra_encountered = 0; + cm->fwd_intra_refresh_frame_flags[cm->mlayer_id] = -1; + cpi->is_olk_overlay = 0; + } else if (cpi->olk_encountered) { + // Non-monotonic OLK SEF: reset buffers except for the OLK. 
+ int ref_flags_to_keep = 0; + for (int layer = 0; layer <= seq_params->max_mlayer_id; layer++) { + ref_flags_to_keep |= cm->olk_refresh_frame_flags[layer]; + } + if (ref_flags_to_keep != 0 && cpi->fb_idx_for_overlay >= 0 && + ((ref_flags_to_keep >> cpi->fb_idx_for_overlay) & 1u)) { + for (int ref_index = 0; ref_index < cm->seq_params.ref_frames; + ref_index++) { + if (!((ref_flags_to_keep >> ref_index) & 1u) && + (cm->ref_frame_map[ref_index] == NULL || + cm->ref_frame_map[ref_index]->long_term_id == -1)) { + if (cm->ref_frame_map[ref_index] != NULL) { + --cm->ref_frame_map[ref_index]->ref_count; + cm->ref_frame_map[ref_index] = NULL; + } } } + cpi->is_olk_overlay = 1; + cpi->gf_state.olk_overlay_last = 1; + cpi->olk_encountered = 0; + } else { + cpi->is_olk_overlay = 0; } - cpi->is_olk_overlay = 1; - cpi->gf_state.olk_overlay_last = 1; - cpi->olk_encountered = 0; } else { cpi->is_olk_overlay = 0; } @@ -4697,7 +4752,10 @@ static int encode_frame_to_data_rate(AV2_COMP *cpi, size_t *size, uint8_t *dest, cpi->seq_params_locked = 1; cm->sef_ref_fb_idx = cpi->fb_idx_for_overlay; - if (cm->last_olk_disp_order_hint > cm->current_frame.display_order_hint) { + if (cm->last_olk_disp_order_hint > cm->current_frame.display_order_hint && + cpi->is_olk_overlay) { + // Non-monotonic OLK: SEF for a frame whose display order precedes the + // OLK is a leading SEF. cm->is_leading_picture = 1; } else { cm->is_leading_picture = 0; @@ -4784,17 +4842,28 @@ static int encode_frame_to_data_rate(AV2_COMP *cpi, size_t *size, uint8_t *dest, return AVM_CODEC_OK; } - if (current_frame->frame_type == KEY_FRAME) { + if (current_frame->frame_type == KEY_FRAME || + av2_is_olk_forward_keyframe(cpi)) { + // CLK or OLK (displayed or hidden). cm->is_leading_picture = -1; - if (cpi->no_show_fwd_kf) { + if (cpi->is_fwd_kf) { + // OLK (displayed or hidden): subsequent frames before the OLK in display + // order are leading pictures. 
cpi->olk_encountered = 1; cm->last_olk_order_hint = cm->current_frame.order_hint; cm->last_olk_disp_order_hint = cm->current_frame.display_order_hint; } else { cpi->olk_encountered = 0; } + } else if (av2_is_fwd_intra_keyframe(cpi)) { + // Hidden intra forward keyframe (monotonic open GOP). + // No leading-picture state — all frames remain regular in monotonic mode. + cm->is_leading_picture = 0; + cpi->fwd_intra_encountered = 1; } else { if (cm->last_olk_disp_order_hint > cm->current_frame.display_order_hint) { + // Non-monotonic OLK: frames with display order before the OLK are + // leading pictures. cm->is_leading_picture = 1; } else { cm->is_leading_picture = 0; @@ -5466,6 +5535,7 @@ int av2_get_compressed_data(AV2_COMP *cpi, unsigned int *frame_flags, // Initialize fields related to forward keyframes cpi->no_show_fwd_kf = 0; + cpi->is_fwd_kf = 0; check_ref_count_status_enc(cpi); if (assign_cur_frame_new_fb(cm) == NULL) return AVM_CODEC_ERROR; diff --git a/av2/encoder/encoder.h b/av2/encoder/encoder.h index 61d623e679..c9fe419ad0 100644 --- a/av2/encoder/encoder.h +++ b/av2/encoder/encoder.h @@ -433,6 +433,14 @@ typedef struct { */ bool fwd_kf_enabled; + /*! + * When true and fwd_kf_enabled is true, the forward keyframe is coded as + * INTRA_ONLY_FRAME instead of KEY_FRAME. This enables open GOP with + * monotonic output: the hidden intra frame is shown via SEF, and reference + * buffers are not reset (allowing inter-prediction across the boundary). + */ + bool intra_only_fwd_kf; + /*! * Indicates if S-Frames should be enabled for the sequence. */ @@ -1062,6 +1070,11 @@ typedef struct { const SubGOPCfg *subgop_cfg; // Number of arf updates before a displayeed frame. int arf_update_counter; + // Non-monotonic multi-mlayer: 1 if the ARF in this GF group had + // allow_direct_use=1 (implicit output). Persists across mlayers so + // the OVERLAY_UPDATE can produce null output even when the reference + // has been evicted from the DPB for later mlayers. 
+ int arf_is_implicit_output; /*!\endcond */ } GF_GROUP; /*!\cond */ @@ -1071,6 +1084,8 @@ typedef struct { int arf_gf_boost_lst; // Track if the last frame in a GOP is a olk overlay int olk_overlay_last; + // Track if the last frame in a GOP is a fwd intra SEF output + int fwd_intra_overlay_last; } GF_STATE; typedef struct { @@ -2379,15 +2394,30 @@ typedef struct AV2_COMP { struct lookahead_ctx *lookahead; /*! - * When set, this flag indicates that the current frame is a forward keyframe. + * When set, this flag indicates that the current frame is a hidden forward + * keyframe (needs kf_filt overlay or SEF to be shown). */ int no_show_fwd_kf; + /*! + * When set, this flag indicates that the current ARF is at the KF boundary + * and should be coded as a keyframe (OLK or INTRA_ONLY_FRAME). This is + * independent of no_show_fwd_kf: for open_leading the frame is a displayed + * OLK (is_fwd_kf=1, no_show_fwd_kf=0). + */ + int is_fwd_kf; /*! * Indicates an OLK obu is encountered in any layer * It is initialized as 0 and set 1 when the first olk is decoded and set 0 * when the first regular frame or the first CLK after the olk is decoded. */ int olk_encountered; + /*! + * Indicates a hidden intra forward keyframe (intra_only_fwd_kf) has been + * coded. Set when the hidden INTRA_ONLY_FRAME is encoded and cleared when + * its SEF is output. Used to protect the hidden intra's DPB slot from + * being overwritten by subsequent pyramid frames. + */ + int fwd_intra_encountered; /*! * If true, the update type is one of overlay updates */ @@ -2928,6 +2958,17 @@ typedef struct AV2_COMP { * write ci obu */ int write_ci_obu_flag; + /*! + * Set to 1 when the current TU is a RAP (CLK or OLK at mlayer 0). + * Used to trigger CI OBU writing for mlayer > 0 in the same TU. + */ + int ci_rap_tu; + /*! + * Per-mlayer CI override flags. 
Set to 1 by AV2E_SET_MLAYER_COLOR_* + * controls to indicate that ci_params_per_layer[ml] was explicitly set + * and should not be overwritten by set_content_interpreation_params(). + */ + int ci_per_layer_overridden[MAX_NUM_MLAYERS]; /*! * Write the Buffer Removal Timing OBU */ @@ -3612,10 +3653,36 @@ static INLINE void check_ref_count_status_enc(AV2_COMP *cpi) { } } -// Returns true if current frame is a shown (visible) keyframe. +// Returns true if current frame is a shown CLK (closed-loop keyframe). +// Excludes OLKs (both displayed and hidden) and hidden intra fwd KFs. static INLINE bool av2_is_shown_keyframe(const AV2_COMP *cpi, FRAME_TYPE frame_type) { - return (frame_type == KEY_FRAME) && !cpi->no_show_fwd_kf; + return (frame_type == KEY_FRAME) && !cpi->is_fwd_kf; +} + +// Returns true if the current frame is a hidden forward keyframe that uses the +// OLK mechanism (KEY_FRAME with no_show_fwd_kf): non-monotonic open GOP. +static INLINE bool av2_is_olk_forward_keyframe(const AV2_COMP *cpi) { + return cpi->no_show_fwd_kf && + cpi->common.current_frame.frame_type == KEY_FRAME; +} + +// Returns true if the current frame is a hidden intra forward keyframe +// (intra_only_fwd_kf): monotonic open GOP. The INTRA_ONLY_FRAME provides a +// random access point but does not use OLK machinery. +static INLINE bool av2_is_fwd_intra_keyframe(const AV2_COMP *cpi) { + return cpi->no_show_fwd_kf && cpi->oxcf.kf_cfg.intra_only_fwd_kf && + cpi->common.current_frame.frame_type == INTRA_ONLY_FRAME; +} + +// Returns true if the forward keyframe at a GOP boundary should be hidden +// (no_show). The fwd KF is hidden only when there is a mechanism to display +// it later: either keyframe filtering (kf_filt >= 2, which uses an overlay) +// or SEF-based hidden frame output. For open_leading (non-monotonic OLK), +// the OLK is an implicit output frame and must NOT be hidden. 
+static INLINE bool av2_fwd_kf_should_be_hidden(const AV2_COMP *cpi) { + return cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 || + cpi->oxcf.ref_frm_cfg.add_sef_for_hidden_frames; } /*!\endcond */ diff --git a/av2/encoder/gop_structure.c b/av2/encoder/gop_structure.c index cb983bd05c..ea23d51ea2 100644 --- a/av2/encoder/gop_structure.c +++ b/av2/encoder/gop_structure.c @@ -322,15 +322,23 @@ static int construct_multi_layer_gf_structure( } if (has_hidden_fwd_kf) { - gf_group->update_type[frame_index] = - use_fwd_kf_overlay ? FWD_KF_OVERLAY_UPDATE : FWD_KF_SUCCESSOR_UPDATE; - gf_group->arf_src_offset[frame_index] = - cpi->common.number_mlayers * - ((use_fwd_kf_overlay ? 0 : 1) + gf_interval - cur_frame_index - 1); - gf_group->cur_frame_idx[frame_index] = cur_frame_index; - gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS; - gf_group->arf_boost[frame_index] = NORMAL_BOOST; - ++frame_index; + // For intra_only_fwd_kf (monotonic open GOP), the hidden intra frame + // is output via SEF like any other hidden frame — no FWD_KF_SUCCESSOR + // is needed (it would violate monotonic output ordering). Only + // non-monotonic OLK or keyframe-filtering modes need the + // successor/overlay. + if (!cpi->oxcf.kf_cfg.intra_only_fwd_kf) { + gf_group->update_type[frame_index] = use_fwd_kf_overlay + ? FWD_KF_OVERLAY_UPDATE + : FWD_KF_SUCCESSOR_UPDATE; + gf_group->arf_src_offset[frame_index] = + cpi->common.number_mlayers * + ((use_fwd_kf_overlay ? 
0 : 1) + gf_interval - cur_frame_index - 1); + gf_group->cur_frame_idx[frame_index] = cur_frame_index; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS; + gf_group->arf_boost[frame_index] = NORMAL_BOOST; + ++frame_index; + } } set_multi_layer_params(twopass, gf_group, rc, frame_info, cur_frame_index, @@ -347,9 +355,11 @@ static int construct_multi_layer_gf_structure( gf_group->arf_boost[frame_index] = NORMAL_BOOST; ++frame_index; ++cur_frame_index; - if (!use_fwd_kf_overlay) { + if (!use_fwd_kf_overlay && !cpi->oxcf.kf_cfg.intra_only_fwd_kf) { // Add one more for the regular frame after the fwd kf sef (implicit - // output). + // output). Not needed for intra_only_fwd_kf: the hidden intra is + // output via SEF like any other hidden frame, and the next frame + // belongs to the next GF group. gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->arf_src_offset[frame_index] = 0; gf_group->cur_frame_idx[frame_index] = cur_frame_index; diff --git a/av2/encoder/pass2_strategy.c b/av2/encoder/pass2_strategy.c index 8418bffc07..e0b97df90e 100644 --- a/av2/encoder/pass2_strategy.c +++ b/av2/encoder/pass2_strategy.c @@ -2798,6 +2798,16 @@ static void setup_target_rate(AV2_COMP *cpi) { rc->base_frame_target = target_rate; } +// Determine if the current ARF is at a keyframe boundary (forward keyframe). 
+static int is_at_fwd_kf_boundary(const AV2_COMP *cpi, int src_index) { + const GF_GROUP *gf_group = &cpi->gf_group; + return src_index == cpi->rc.frames_to_key * (int)cpi->common.number_mlayers && + src_index != 0 && cpi->oxcf.kf_cfg.fwd_kf_enabled && + gf_group->size > 1 && + gf_group->update_type[gf_group->index] != FWD_KF_OVERLAY_UPDATE && + gf_group->update_type[gf_group->index] != FWD_KF_SUCCESSOR_UPDATE; +} + void av2_get_second_pass_params(AV2_COMP *cpi, EncodeFrameParams *const frame_params) { RATE_CONTROL *const rc = &cpi->rc; @@ -2813,21 +2823,26 @@ void av2_get_second_pass_params(AV2_COMP *cpi, setup_target_rate(cpi); int src_index = gf_group->arf_src_offset[gf_group->index]; - if (src_index == cpi->rc.frames_to_key * (int)cpi->common.number_mlayers && - src_index != 0 && cpi->oxcf.kf_cfg.fwd_kf_enabled && - gf_group->size > 1 && - gf_group->update_type[gf_group->index] != FWD_KF_OVERLAY_UPDATE && - gf_group->update_type[gf_group->index] != FWD_KF_SUCCESSOR_UPDATE) { - cpi->no_show_fwd_kf = 1; + // Determine if this ARF is at the KF boundary (forward keyframe). + if (is_at_fwd_kf_boundary(cpi, src_index)) { + cpi->is_fwd_kf = 1; + // Hidden only when there is a mechanism to show it later + // (kf_filt overlay or SEF). For open_leading, the OLK is displayed + // directly as an implicit output frame. + cpi->no_show_fwd_kf = av2_fwd_kf_should_be_hidden(cpi) ? 1 : 0; } // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. 
if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE || update_type == KFFLT_UPDATE) { - if (cpi->no_show_fwd_kf) { + if (cpi->is_fwd_kf) { assert(update_type == ARF_UPDATE || update_type == KFFLT_UPDATE); - frame_params->frame_type = KEY_FRAME; - frame_params->frame_params_obu_type = OBU_OPEN_LOOP_KEY; + if (cpi->oxcf.kf_cfg.intra_only_fwd_kf) { + frame_params->frame_type = INTRA_ONLY_FRAME; + } else { + frame_params->frame_type = KEY_FRAME; + frame_params->frame_params_obu_type = OBU_OPEN_LOOP_KEY; + } } else { frame_params->frame_type = INTER_FRAME; update_subgop_stats(&cpi->gf_group, &cpi->subgop_stats, @@ -2848,6 +2863,11 @@ void av2_get_second_pass_params(AV2_COMP *cpi, int sframe_dist = oxcf->kf_cfg.sframe_dist; int sframe_mode = oxcf->kf_cfg.sframe_mode; int enable_sframe = oxcf->kf_cfg.enable_sframe; + // S-frames are only allowed in independent mlayers. + if (sframe_dist && !is_mlayer_independent(&cpi->common.seq_params, + cpi->common.mlayer_id)) { + sframe_dist = 0; + } CurrentFrame *const current_frame = &cpi->common.current_frame; // ARF_UPDATE and KFFLT_UPDATE is set as S_FRAME in the RA case if (sframe_dist && enable_sframe) { @@ -2911,13 +2931,15 @@ void av2_get_second_pass_params(AV2_COMP *cpi, oxcf->kf_cfg.key_freq_min *= cpi->common.number_mlayers; oxcf->kf_cfg.key_freq_max *= cpi->common.number_mlayers; } - if (cpi->gf_state.olk_overlay_last) { + if (cpi->gf_state.olk_overlay_last || + cpi->gf_state.fwd_intra_overlay_last) { const int kf_offset = -rc->frames_to_key; - // The olk key frame has been encoded. Next is the arf. + // The forward keyframe (OLK or hidden intra) has been encoded and output. + // Next is the arf — no CLK needed at this key period boundary. frame_params->frame_type = INTER_FRAME; frame_params->frame_params_obu_type = NUM_OBU_TYPES; - // Temporarily change decrease key frame interval since we've already seen - // the key frame in the OLK. 
+ // Temporarily decrease key frame interval since we've already seen + // the key frame in the forward keyframe. oxcf->kf_cfg.key_freq_min = AVMMAX( 0, oxcf->kf_cfg.key_freq_min - (int)cpi->common.number_mlayers); oxcf->kf_cfg.key_freq_max = @@ -2925,6 +2947,7 @@ void av2_get_second_pass_params(AV2_COMP *cpi, oxcf->kf_cfg.key_freq_max - (int)cpi->common.number_mlayers); find_next_key_frame(cpi, &this_frame); rc->frames_since_key += kf_offset * (int)cpi->common.number_mlayers; + cpi->gf_state.fwd_intra_overlay_last = 0; } else { assert(rc->frames_to_key >= -1); frame_params->frame_type = KEY_FRAME; @@ -3026,13 +3049,11 @@ void av2_get_second_pass_params(AV2_COMP *cpi, gf_group->update_type[gf_group->size - 1] == OVERLAY_UPDATE; cpi->no_show_fwd_kf = 0; + cpi->is_fwd_kf = 0; int src_index = gf_group->arf_src_offset[gf_group->index]; - if (src_index == cpi->rc.frames_to_key * (int)cpi->common.number_mlayers && - src_index != 0 && cpi->oxcf.kf_cfg.fwd_kf_enabled && - gf_group->size > 1 && - gf_group->update_type[gf_group->index] != FWD_KF_OVERLAY_UPDATE && - gf_group->update_type[gf_group->index] != FWD_KF_SUCCESSOR_UPDATE) { - cpi->no_show_fwd_kf = 1; + if (is_at_fwd_kf_boundary(cpi, src_index)) { + cpi->is_fwd_kf = 1; + cpi->no_show_fwd_kf = av2_fwd_kf_should_be_hidden(cpi) ? 1 : 0; } const int update_type = gf_group->update_type[gf_group->index]; @@ -3040,9 +3061,13 @@ void av2_get_second_pass_params(AV2_COMP *cpi, !(update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE); if (update_type == ARF_UPDATE) { - if (cpi->no_show_fwd_kf) { - frame_params->frame_type = KEY_FRAME; - frame_params->frame_params_obu_type = OBU_OPEN_LOOP_KEY; + if (cpi->is_fwd_kf) { + if (cpi->oxcf.kf_cfg.intra_only_fwd_kf) { + frame_params->frame_type = INTRA_ONLY_FRAME; + } else { + frame_params->frame_type = KEY_FRAME; + frame_params->frame_params_obu_type = OBU_OPEN_LOOP_KEY; + } } else { frame_params->frame_type = rc->frames_since_key == 0 ? 
KEY_FRAME : (frame_params->frame_type == S_FRAME) diff --git a/av2/encoder/ratectrl.c b/av2/encoder/ratectrl.c index a3581edfb6..62e9aa3b60 100644 --- a/av2/encoder/ratectrl.c +++ b/av2/encoder/ratectrl.c @@ -1598,11 +1598,9 @@ static int rc_pick_q_and_bounds(const AV2_COMP *cpi, int width, int height, gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; if (frame_is_intra_only(cm)) { - const int is_fwd_kf = cm->current_frame.frame_type == KEY_FRAME && - cm->immediate_output_picture == 0 && - cpi->no_show_fwd_kf; + const int is_fwd_kf_local = cpi->is_fwd_kf; get_intra_q_and_bounds(cpi, width, height, &active_best_quality, - &active_worst_quality, qp, is_fwd_kf); + &active_worst_quality, qp, is_fwd_kf_local); #ifdef STRICT_RC active_best_quality = 0; #endif diff --git a/av2/exports_enc b/av2/exports_enc index aa0f232a64..365035b002 100644 --- a/av2/exports_enc +++ b/av2/exports_enc @@ -1,2 +1,6 @@ data avm_codec_av2_cx_algo text avm_codec_av2_cx +data av2_level_defs +text avm_wb_write_rice_golomb +text avm_wb_is_byte_aligned +text avm_wb_write_uleb diff --git a/avm/avm_image.h b/avm/avm_image.h index 0836005fd8..4b770c4ae0 100644 --- a/avm/avm_image.h +++ b/avm/avm_image.h @@ -390,6 +390,8 @@ typedef struct avm_image { int mlayer_id; /**< mlayer id of image */ int xlayer_id; /**< xlayer id of image */ int stream_id; /**< stream index [0..num_streams-1], -1 for global */ + unsigned int + display_order_hint; /**< display order hint for output reordering */ /*!\brief The following member may be set by the application to associate * data with this image. diff --git a/avm/avmcx.h b/avm/avmcx.h index 5ec1bca1d7..4f7059cce8 100644 --- a/avm/avmcx.h +++ b/avm/avmcx.h @@ -1249,6 +1249,56 @@ enum avme_enc_control_id { * the restricted_prediction_switch output ordering path. */ AV2E_SET_FORCE_DEFERRED_FRAMES_FOR_RAS_TEST = 185, + /*!\brief Codec control function to set obu_xlayer_id for multi-xlayer + * encoding, int parameter (0-30). 
+ */ + AVME_SET_XLAYER_ID = 186, + + /*!\brief Codec control function to set mlayer_dependency_present_flag, + * unsigned int parameter (0 or 1). + */ + AV2E_SET_MLAYER_DEPENDENCY_PRESENT = 187, + + /*!\brief Codec control function to set mlayer dependency map for a given + * mlayer. Takes two parameters: mlayer index (unsigned int) and dependency + * bitmask (unsigned int). Bit j set means mlayer depends on mlayer j. + */ + AV2E_SET_MLAYER_DEPENDENCY_MAP = 188, + + /*!\brief Codec control function to enable intra-only forward keyframes. + * + * When enabled (and fwd_kf_enabled is true), forward keyframes are coded as + * INTRA_ONLY_FRAME instead of KEY_FRAME. This enables open GOP with + * monotonic output: the hidden intra frame is shown via SEF, and reference + * buffers are not reset. + * int parameter. (ID 188 is taken by AV2E_SET_MLAYER_DEPENDENCY_MAP.) + */ + AV2E_SET_INTRA_ONLY_FWD_KF = 193, + + /*!\brief Codec control function to set per-mlayer color_primaries. + * Takes two parameters: mlayer index (unsigned int) and color_primaries + * value (unsigned int). Populates ci_params_per_layer[mlayer_idx]. + */ + AV2E_SET_MLAYER_COLOR_PRIMARIES = 189, + + /*!\brief Codec control function to set per-mlayer transfer_characteristics. + * Takes two parameters: mlayer index (unsigned int) and + * transfer_characteristics value (unsigned int). + */ + AV2E_SET_MLAYER_TRANSFER_CHARACTERISTICS = 190, + + /*!\brief Codec control function to set per-mlayer matrix_coefficients. + * Takes two parameters: mlayer index (unsigned int) and + * matrix_coefficients value (unsigned int). + */ + AV2E_SET_MLAYER_MATRIX_COEFFICIENTS = 191, + + /*!\brief Codec control function to set per-mlayer color_range. + * Takes two parameters: mlayer index (unsigned int) and full_range_flag + * value (unsigned int, 0=limited, 1=full). 
+ */ + AV2E_SET_MLAYER_COLOR_RANGE = 192, + }; /*!\brief avm 1-D scaling mode @@ -1776,6 +1826,13 @@ AVM_CTRL_USE_TYPE(AV2E_SET_MONOTONIC_OUTPUT_ORDER, int) AVM_CTRL_USE_TYPE(AV2E_SET_FORCE_DEFERRED_FRAMES_FOR_RAS_TEST, int) #define AVME_CTRL_AV2E_SET_FORCE_DEFERRED_FRAMES_FOR_RAS_TEST + +AVM_CTRL_USE_TYPE(AVME_SET_XLAYER_ID, int) +#define AVM_CTRL_AVME_SET_XLAYER_ID + +AVM_CTRL_USE_TYPE(AV2E_SET_INTRA_ONLY_FWD_KF, int) +#define AVME_CTRL_AV2E_SET_INTRA_ONLY_FWD_KF + /*!\endcond */ /*! @} - end defgroup avm_encoder */ #ifdef __cplusplus diff --git a/avm/avmdx.h b/avm/avmdx.h index e83a34305f..6856c7974a 100644 --- a/avm/avmdx.h +++ b/avm/avmdx.h @@ -162,6 +162,51 @@ typedef struct avm_screen_content_tools_info { int force_integer_mv; } avm_screen_content_tools_info; +/*!\brief Per-xlayer layer info exposed by AV2D_GET_LCR_INFO. + * + * Mirrors the key fields from the internal LCR structures for each + * extended layer. Populated from Global or Local LCR OBU data. + */ +typedef struct avm_xlayer_layer_info { + int xlayer_id; /**< Extended layer ID (0-30) */ + int layer_type; /**< 0=texture, 1=auxiliary, 2=stereo, 3=dependent */ + int auxiliary_type; /**< 0=alpha, 1=depth, 2=segmentation, 3=gain_map + (-1 if N/A) */ + int view_type; /**< 0=unspecified, 1=center, 2=left, 3=right, + 4=explicit */ + int max_width; /**< lcr_max_pic_width from RepresentationInfo */ + int max_height; /**< lcr_max_pic_height from RepresentationInfo */ + int num_mlayers; /**< Number of embedded (media) layers */ +} avm_xlayer_layer_info_t; + +/*!\brief LCR info exposed by AV2D_GET_LCR_INFO. + * + * Contains layer configuration for all xlayers in the stream, + * assembled from Global and/or Local LCR OBUs. + */ +typedef struct avm_lcr_info { + int num_xlayers; /**< Number of xlayers */ + avm_xlayer_layer_info_t xlayers[31]; /**< Per-xlayer info */ +} avm_lcr_info_t; + +/*!\brief Atlas info exposed by AV2D_GET_ATLAS_INFO. 
+ * + * Contains atlas canvas dimensions and per-segment placement, + * populated from Atlas OBUs. + */ +typedef struct avm_atlas_info { + int atlas_width; /**< Canvas width */ + int atlas_height; /**< Canvas height */ + int num_segments; /**< Number of atlas segments */ + struct { + int xlayer_id; /**< Which xlayer this segment belongs to */ + int pos_x; /**< Top-left X position in canvas */ + int pos_y; /**< Top-left Y position in canvas */ + int width; /**< Segment width */ + int height; /**< Segment height */ + } segments[256]; /**< Per-segment info (MAX_NUM_ATLAS_SEGMENTS) */ +} avm_atlas_info_t; + /*!\brief Structure to hold the external reference frame pointer. * * Define a structure to hold the external reference frame pointer. @@ -342,6 +387,33 @@ enum avm_dec_control_id { AV2D_SET_BRU_OPT_MODE, + /*!\brief Codec control function to get LCR (Layer Configuration Record) + * info, avm_lcr_info_t* parameter + * + * Returns layer type, auxiliary type, view type, dimensions, and mlayer + * count for each extended layer. Populated from Global/Local LCR OBUs. + * Returns AVM_CODEC_ERROR if no LCR has been parsed yet. + */ + AV2D_GET_LCR_INFO, + + /*!\brief Codec control function to get Atlas segment info, + * avm_atlas_info_t* parameter + * + * Returns atlas canvas dimensions and per-segment placement (position, + * size, xlayer_id). Populated from Atlas OBUs. + * Returns AVM_CODEC_ERROR if no Atlas OBU has been parsed yet. + */ + AV2D_GET_ATLAS_INFO, + + /*!\brief Codec control function to get monotonic_output_order_flag, + * unsigned int* parameter + * + * Returns 1 if monotonic_output_order_flag is set in the sequence header, + * 0 otherwise. Returns AVM_CODEC_ERROR if no sequence header has been + * parsed yet. 
+ */ + AV2D_GET_MONOTONIC_OUTPUT_ORDER, + AVM_DECODER_CTRL_ID_MAX, /*!\brief Codec control function to check the presence of forward key frames @@ -512,6 +584,16 @@ AVM_CTRL_USE_TYPE(AV2D_SET_OUTPUT_ALL_LAYERS, int) AVM_CTRL_USE_TYPE(AV2_SET_INSPECTION_CALLBACK, avm_inspect_init *) #define AVM_CTRL_AV2_SET_INSPECTION_CALLBACK + +AVM_CTRL_USE_TYPE(AV2D_GET_LCR_INFO, avm_lcr_info_t *) +#define AVM_CTRL_AV2D_GET_LCR_INFO + +AVM_CTRL_USE_TYPE(AV2D_GET_ATLAS_INFO, avm_atlas_info_t *) +#define AVM_CTRL_AV2D_GET_ATLAS_INFO + +AVM_CTRL_USE_TYPE(AV2D_GET_MONOTONIC_OUTPUT_ORDER, unsigned int *) +#define AVM_CTRL_AV2D_GET_MONOTONIC_OUTPUT_ORDER + /*!\endcond */ /*! @} - end defgroup avm_decoder */ #ifdef __cplusplus diff --git a/cfg/xlayer/annexG2_360degree_9xlayer.json b/cfg/xlayer/annexG2_360degree_9xlayer.json new file mode 100644 index 0000000000..f1522f0b2a --- /dev/null +++ b/cfg/xlayer/annexG2_360degree_9xlayer.json @@ -0,0 +1,168 @@ +{ + "comment": "Annex G.2: 360-degree viewport-dependent streaming with subpictures", + "comment2": "9 extended layers in a 3x3 grid covering 3840x1920 equirectangular projection", + "comment3": "Each xlayer has 3 embedded layers: texture, alpha, depth", + "comment4": "Uses LCR (not MSDO, which is limited to 4 streams)", + "xlayers": [ + { + "xlayer_id": 0, + "input": "subpic_topleft.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 200, + "cpu_used": 5, + "comment": "Top-left subpicture, low quality (corner)" + }, + { + "xlayer_id": 1, + "input": "subpic_topcenter.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 160, + "cpu_used": 5, + "comment": "Top-center subpicture, medium quality (adjacent)" + }, + { + "xlayer_id": 2, + "input": "subpic_topright.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + 
"view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 200, + "cpu_used": 5, + "comment": "Top-right subpicture, low quality (corner)" + }, + { + "xlayer_id": 3, + "input": "subpic_midleft.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 160, + "cpu_used": 5, + "comment": "Middle-left subpicture, medium quality (adjacent)" + }, + { + "xlayer_id": 4, + "input": "subpic_center.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 128, + "cpu_used": 5, + "comment": "CENTER VIEWPORT subpicture, HIGH quality" + }, + { + "xlayer_id": 5, + "input": "subpic_midright.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 160, + "cpu_used": 5, + "comment": "Middle-right subpicture, medium quality (adjacent)" + }, + { + "xlayer_id": 6, + "input": "subpic_botleft.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 200, + "cpu_used": 5, + "comment": "Bottom-left subpicture, low quality (corner)" + }, + { + "xlayer_id": 7, + "input": "subpic_botcenter.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 160, + "cpu_used": 5, + "comment": "Bottom-center subpicture, medium quality (adjacent)" + }, + { + "xlayer_id": 8, + "input": "subpic_botright.y4m", + "width": 1280, + "height": 640, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 3, + "qp": 200, + "cpu_used": 5, + "comment": "Bottom-right subpicture, low quality (corner/back-facing)" + } + ], + "global_lcr": { + "enable": true, + 
"purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false, + "comment": "MSDO limited to 4 streams; 9 xlayers requires LCR only" + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [4], + "comment": "OP0: Center viewport subpicture only (low bandwidth)" + }, + { + "intent": 1, + "xlayer_map": [1, 3, 4, 5, 7], + "comment": "OP1: Center + adjacent subpictures (medium bandwidth)" + }, + { + "intent": 2, + "xlayer_map": [0, 1, 2, 3, 4, 5, 6, 7, 8], + "comment": "OP2: All 9 subpictures - complete sphere coverage (high bandwidth)" + } + ] + } + ], + "atlas": { + "enable": true, + "mode": 0, + "comment": "Enhanced Atlas: 3x3 uniform grid composing 3840x1920 equirectangular" + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "360degree_9subpic_muxed.obu" +} diff --git a/cfg/xlayer/annexG3_videoconf_3xlayer.json b/cfg/xlayer/annexG3_videoconf_3xlayer.json new file mode 100644 index 0000000000..de325233f8 --- /dev/null +++ b/cfg/xlayer/annexG3_videoconf_3xlayer.json @@ -0,0 +1,90 @@ +{ + "comment": "Annex G.3: Subpicture composition for video conferencing", + "comment2": "3 extended layers: main speaker (high-res) + 2 participants (medium-res)", + "comment3": "Atlas composes into a 1920x1080 virtual canvas with non-uniform grid", + "comment4": "Main speaker at left (1280x1080), participants stacked at right (640x540 each)", + "xlayers": [ + { + "xlayer_id": 0, + "input": "main_speaker.y4m", + "width": 1280, + "height": 1080, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "comment": "Main speaker, high resolution, high bitrate" + }, + { + "xlayer_id": 1, + "input": "participant2.y4m", + "width": 480, + "height": 360, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + 
"num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "comment": "Participant 2, encoded at 480x360, upsampled to 640x540 in atlas" + }, + { + "xlayer_id": 2, + "input": "participant3.y4m", + "width": 640, + "height": 540, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "comment": "Participant 3, medium resolution" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 6, + "dependent_xlayers": false, + "doh_constraint": true, + "comment": "purpose_id=6: Multiview Playback" + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: Main speaker only (mobile/low bandwidth)" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: Main speaker + participant 2" + }, + { + "intent": 2, + "xlayer_map": [0, 1, 2], + "comment": "OP2: All participants - full conferencing view" + } + ] + } + ], + "atlas": { + "enable": true, + "mode": 0, + "comment": "Enhanced Atlas: 2x2 non-uniform grid, main speaker spans left column" + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "videoconf_3participant_muxed.obu" +} diff --git a/cfg/xlayer/annexG4_roi_scalable_2xlayer.json b/cfg/xlayer/annexG4_roi_scalable_2xlayer.json new file mode 100644 index 0000000000..72f8277c16 --- /dev/null +++ b/cfg/xlayer/annexG4_roi_scalable_2xlayer.json @@ -0,0 +1,76 @@ +{ + "comment": "Annex G.4: Region-of-interest scalability for sports broadcast", + "comment2": "2 extended layers: base stadium view + high-quality field-of-play enhancement", + "comment3": "Base layer coded at 1920x1088 (padded for SB alignment), cropped to 1920x1080", + "comment4": "Enhancement layer is 1280x720 field-of-play overlaid at center position (320,180)", + "xlayers": [ + { + "xlayer_id": 0, + "input": "stadium_full.y4m", + "width": 
1920, + "height": 1080, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "comment": "Full stadium view, base quality. Coded as 1920x1088 with 8px bottom padding, cropped to 1920x1080" + }, + { + "xlayer_id": 1, + "input": "field_of_play.y4m", + "width": 1280, + "height": 720, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "comment": "Field-of-play region, high quality enhancement. Overlays base at center (320,180)" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: Base layer only - full stadium at base quality (with normative cropping)" + }, + { + "intent": 1, + "xlayer_map": [1], + "comment": "OP1: Enhancement only - field-of-play at high quality (no stadium context)" + }, + { + "intent": 2, + "xlayer_map": [0, 1], + "comment": "OP2: Full quality - stadium with high-quality field-of-play overlay" + } + ] + } + ], + "atlas": { + "enable": true, + "mode": 0, + "comment": "Enhanced Atlas: 3x3 non-uniform grid. Segment 0 spans all 9 regions (full frame). Segment 1 is center cell only (1280x720 at position 320,180). Enhancement overlays base via lcr_priority_order." + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "sports_roi_scalable_muxed.obu" +} diff --git a/cfg/xlayer/stereo_2layer.json b/cfg/xlayer/stereo_2layer.json new file mode 100644 index 0000000000..9505683c40 --- /dev/null +++ b/cfg/xlayer/stereo_2layer.json @@ -0,0 +1,70 @@ +{ + "comment": "Stereo simulcast: left and right views as separate xlayers. Each xlayer is encoded independently without inter-layer prediction. 
For stereo with inter-layer prediction, use stereo_embedded_2ml.json instead.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "left_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "stereo", + "view_type": "left", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5 + }, + { + "xlayer_id": 1, + "input": "right_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "stereo", + "view_type": "right", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: left view only (mono)" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: stereo (both views)" + } + ] + } + ], + "atlas": { + "enable": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "stereo_muxed.obu" +} diff --git a/cfg/xlayer/stereo_embedded_2ml.json b/cfg/xlayer/stereo_embedded_2ml.json new file mode 100644 index 0000000000..8ae3d9a36d --- /dev/null +++ b/cfg/xlayer/stereo_embedded_2ml.json @@ -0,0 +1,33 @@ +{ + "comment": "Stereo via embedded layers: left and right views as 2 mlayers within a single xlayer. mlayer 1 (right) uses inter-layer prediction from mlayer 0 (left) via depends_on. 
For simulcast stereo without inter-layer prediction, use stereo_2layer.json instead.", + "inputs": [ + { "name": "left", "filename": "left.yuv", "width": 1920, "height": 1080 }, + { "name": "right", "filename": "right.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "left", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "stereo", "view_type": "left", + "color_primaries": 1, "transfer_characteristics": 1, + "matrix_coefficients": 1, "full_range_flag": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1:1", "input_source": "left", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "right", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [0] } + ], + "qp": 128, "cpu_used": 5 } + ], + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "mlayer_info_idc": 2, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1] }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [2] } + ] + }], + "output": "stereo_embedded.obu" +} diff --git a/cfg/xlayer/subpicture_3region.json b/cfg/xlayer/subpicture_3region.json new file mode 100644 index 0000000000..1195f4bd82 --- /dev/null +++ b/cfg/xlayer/subpicture_3region.json @@ -0,0 +1,101 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "region_a.y4m", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 0, + "atlas_pos_y": 0 + }, + { + "xlayer_id": 1, + "input": "region_b.y4m", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 960, + 
"atlas_pos_y": 0 + }, + { + "xlayer_id": 2, + "input": "region_c.y4m", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 0, + "atlas_pos_y": 540 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: region A only" + }, + { + "intent": 0, + "xlayer_map": [1], + "comment": "OP1: region B only" + }, + { + "intent": 0, + "xlayer_map": [2], + "comment": "OP2: region C only" + }, + { + "intent": 1, + "xlayer_map": [0, 1, 2], + "comment": "OP3: full picture (all regions)" + } + ] + } + ], + "atlas": { + "enable": true, + "mode": 0, + "width": 1920, + "height": 1080, + "uniform_spacing": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "subpicture_3region_muxed.obu" +} diff --git a/cfg/xlayer/subpicture_4quadrant.json b/cfg/xlayer/subpicture_4quadrant.json new file mode 100644 index 0000000000..b7c73722e0 --- /dev/null +++ b/cfg/xlayer/subpicture_4quadrant.json @@ -0,0 +1,126 @@ +{ + "inputs": [ + { + "name": "default", + "filename": "video_1920x1080.yuv", + "width": 1920, + "height": 1080 + } + ], + "xlayers": [ + { + "xlayer_id": 0, + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 0, + "atlas_pos_y": 0 + }, + { + "xlayer_id": 1, + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 960, + "atlas_pos_y": 
0 + }, + { + "xlayer_id": 2, + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 0, + "atlas_pos_y": 540 + }, + { + "xlayer_id": 3, + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "atlas_pos_x": 960, + "atlas_pos_y": 540 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: top-left quadrant" + }, + { + "intent": 0, + "xlayer_map": [1], + "comment": "OP1: top-right quadrant" + }, + { + "intent": 0, + "xlayer_map": [2], + "comment": "OP2: bottom-left quadrant" + }, + { + "intent": 0, + "xlayer_map": [3], + "comment": "OP3: bottom-right quadrant" + }, + { + "intent": 1, + "xlayer_map": [0, 1, 2, 3], + "comment": "OP4: full picture (all quadrants)" + } + ] + } + ], + "atlas": { + "enable": true, + "mode": 0, + "width": 1920, + "height": 1080, + "uniform_spacing": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "subpicture_4quadrant_muxed.obu" +} diff --git a/cfg/xlayer/subpicture_embedded_4q.json b/cfg/xlayer/subpicture_embedded_4q.json new file mode 100644 index 0000000000..ceb596a17a --- /dev/null +++ b/cfg/xlayer/subpicture_embedded_4q.json @@ -0,0 +1,26 @@ +{ + "inputs": [ + { "name": "video", "filename": "video.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "video", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "num_embedded_layers": 4, + "embedded_layers": [ + { "scaling_mode": "1:1", 
"input_source": "video", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 960, "height": 540, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 960, "atlas_pos_y": 0, "width": 960, "height": 540, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 0, "atlas_pos_y": 540, "width": 960, "height": 540, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 960, "atlas_pos_y": 540, "width": 960, "height": 540, + "depends_on": [] } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "subpic_embedded.obu" +} diff --git a/cfg/xlayer/subpicture_texture_alpha_4q.json b/cfg/xlayer/subpicture_texture_alpha_4q.json new file mode 100644 index 0000000000..d2a4258f68 --- /dev/null +++ b/cfg/xlayer/subpicture_texture_alpha_4q.json @@ -0,0 +1,127 @@ +{ + "inputs": [ + { + "name": "texture", + "filename": "texture_1920x1080.yuv", + "width": 1920, + "height": 1080 + }, + { + "name": "alpha", + "filename": "alpha_1920x1080.yuv", + "width": 1920, + "height": 1080 + } + ], + "xlayers": [ + { + "xlayer_id": 0, + "input_source": "texture", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "atlas_pos_x": 0, + "atlas_pos_y": 0 + }, + { + "xlayer_id": 1, + "input_source": "texture", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "texture", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "atlas_pos_x": 960, + "atlas_pos_y": 0 + }, + { + "xlayer_id": 2, + "input_source": "alpha", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "alpha", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 
0, + "atlas_pos_x": 0, + "atlas_pos_y": 0 + }, + { + "xlayer_id": 3, + "input_source": "alpha", + "width": 960, + "height": 540, + "profile": 3, + "level": 12, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "alpha", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "atlas_pos_x": 960, + "atlas_pos_y": 0 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0, 1], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1, 2, 3], + "comment": "OP1: texture + alpha" + } + ] + } + ], + "atlas": { + "enable": true, + "mode": 0, + "width": 1920, + "height": 1080, + "uniform_spacing": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "subpic_tex_alpha_muxed.obu" +} diff --git a/cfg/xlayer/test_scalable_closed_mono.json b/cfg/xlayer/test_scalable_closed_mono.json new file mode 100644 index 0000000000..4362d3f27d --- /dev/null +++ b/cfg/xlayer/test_scalable_closed_mono.json @@ -0,0 +1,41 @@ +{ + "comment": "Test: spatial scalability 2-mlayer, closed GOP, monotonic output. 
Uses inter-layer prediction (depends_on).", + "xlayers": [ + { + "xlayer_id": 0, + "input": "BasketballPass_416x240_50.yuv", + "width": 416, + "height": 240, + "layer_type": "texture", + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "depends_on": [] }, + { "scaling_mode": "1:1", "depends_on": [0] } + ], + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "gop_mode": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_keyframe_filtering", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "test_scalable_closed_mono.obu" +} diff --git a/cfg/xlayer/test_scalable_closed_nonmono.json b/cfg/xlayer/test_scalable_closed_nonmono.json new file mode 100644 index 0000000000..24e19d4bc3 --- /dev/null +++ b/cfg/xlayer/test_scalable_closed_nonmono.json @@ -0,0 +1,41 @@ +{ + "comment": "Test: spatial scalability 2-mlayer, closed GOP, non-monotonic output. 
Uses inter-layer prediction (depends_on).", + "xlayers": [ + { + "xlayer_id": 0, + "input": "BasketballPass_416x240_50.yuv", + "width": 416, + "height": 240, + "layer_type": "texture", + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "depends_on": [] }, + { "scaling_mode": "1:1", "depends_on": [0] } + ], + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "gop_mode": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_keyframe_filtering", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "combined_tu": true, + "monotonic_output_order": false, + "output": "test_scalable_closed_nonmono.obu" +} diff --git a/cfg/xlayer/test_scalable_open_leading_nonmono.json b/cfg/xlayer/test_scalable_open_leading_nonmono.json new file mode 100644 index 0000000000..b6a23c45c8 --- /dev/null +++ b/cfg/xlayer/test_scalable_open_leading_nonmono.json @@ -0,0 +1,41 @@ +{ + "comment": "Test: spatial scalability 2-mlayer, open GOP (OLK + leading), non-monotonic. Uses inter-layer prediction. 
Requires lag_in_frames > 0 for forward keyframe.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "BasketballPass_416x240_50.yuv", + "width": 416, + "height": 240, + "layer_type": "texture", + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "depends_on": [] }, + { "scaling_mode": "1:1", "depends_on": [0] } + ], + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 19, + "kf_max_dist": 9, + "gop_mode": 1, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "combined_tu": true, + "monotonic_output_order": false, + "output": "test_scalable_open_leading_nonmono.obu" +} diff --git a/cfg/xlayer/test_scalable_open_sef_mono.json b/cfg/xlayer/test_scalable_open_sef_mono.json new file mode 100644 index 0000000000..b223a30844 --- /dev/null +++ b/cfg/xlayer/test_scalable_open_sef_mono.json @@ -0,0 +1,41 @@ +{ + "comment": "Test: spatial scalability 2-mlayer, open GOP (INTRA_ONLY_FRAME + SEF), monotonic. Uses inter-layer prediction. 
Forward keyframe coded as INTRA_ONLY_FRAME (no ref reset).", + "xlayers": [ + { + "xlayer_id": 0, + "input": "BasketballPass_416x240_50.yuv", + "width": 416, + "height": 240, + "layer_type": "texture", + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "depends_on": [] }, + { "scaling_mode": "1:1", "depends_on": [0] } + ], + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 19, + "kf_max_dist": 9, + "gop_mode": 2, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "test_scalable_open_sef_mono.obu" +} diff --git a/cfg/xlayer/texture_2mlayer_fast.json b/cfg/xlayer/texture_2mlayer_fast.json new file mode 100644 index 0000000000..1acdee3f0f --- /dev/null +++ b/cfg/xlayer/texture_2mlayer_fast.json @@ -0,0 +1,37 @@ +{ + "comment": "Fast 2-embedded-layer config for debugging. 
Uses spatial scalability (1/2 + 1:1) with expensive coding tools disabled via codec_controls.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 416, + "height": 240, + "layer_type": "texture", + "num_embedded_layers": 2, + "scaling_mode": ["1/2", "1:1"], + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_keyframe_filtering", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_2ml_fast.obu" +} diff --git a/cfg/xlayer/texture_alpha_depth_3layer.json b/cfg/xlayer/texture_alpha_depth_3layer.json new file mode 100644 index 0000000000..1683fda8db --- /dev/null +++ b/cfg/xlayer/texture_alpha_depth_3layer.json @@ -0,0 +1,101 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "texture", + "view_type": "center", + "color_primaries": 1, + "transfer_characteristics": 1, + "matrix_coefficients": 1, + "full_range_flag": 0, + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5 + }, + { + "xlayer_id": 1, + "input": "alpha_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "alpha", + "color_primaries": 1, + "transfer_characteristics": 13, + "matrix_coefficients": 0, + "full_range_flag": 1, + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 140, + "cpu_used": 5 + }, + { + "xlayer_id": 2, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + 
"layer_type": "auxiliary", + "auxiliary_type": "depth", + "color_primaries": 1, + "transfer_characteristics": 1, + "matrix_coefficients": 0, + "full_range_flag": 1, + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: texture + alpha" + }, + { + "intent": 2, + "xlayer_map": [0, 1, 2], + "comment": "OP2: texture + alpha + depth (all layers)" + } + ] + } + ], + "atlas": { + "enable": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_alpha_depth_muxed.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer.json b/cfg/xlayer/texture_depth_2layer.json new file mode 100644 index 0000000000..978c99b2d4 --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer.json @@ -0,0 +1,73 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5 + }, + { + "xlayer_id": 1, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "local_lcr": { + "enable": true, + "mode": "both" + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + 
"intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: texture + depth" + } + ] + } + ], + "atlas": { + "enable": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_muxed.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer_3ml.json b/cfg/xlayer/texture_depth_2layer_3ml.json new file mode 100644 index 0000000000..ba806f45ef --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer_3ml.json @@ -0,0 +1,65 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture.y4m", + "width": 1920, + "height": 1080, + "num_embedded_layers": 3, + "scaling_mode": ["1/4", "1/2", "1:1"], + "layer_type": "texture", + "color_primaries": 1, + "transfer_characteristics": 1, + "matrix_coefficients": 1, + "full_range_flag": 0, + "qp": 128, + "cpu_used": 9 + }, + { + "xlayer_id": 1, + "input": "depth.y4m", + "width": 1920, + "height": 1080, + "num_embedded_layers": 1, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "color_primaries": 1, + "transfer_characteristics": 1, + "matrix_coefficients": 0, + "full_range_flag": 1, + "qp": 160, + "cpu_used": 9 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 6, + "doh_constraint": true + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "mlayer_count": [1] + }, + { + "intent": 1, + "xlayer_map": [0], + "mlayer_count": [3] + }, + { + "intent": 2, + "xlayer_map": [0, 1], + "mlayer_count": [3, 1] + } + ] + } + ], + "output": "texture_depth_3ml.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer_clk.json b/cfg/xlayer/texture_depth_2layer_clk.json new file mode 100644 index 0000000000..2ba23b9698 --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer_clk.json @@ -0,0 +1,77 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": 
"texture_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "kf_max_dist": 8, + "gop_mode": "closed" + }, + { + "xlayer_id": 1, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "kf_max_dist": 8, + "gop_mode": "closed" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "local_lcr": { + "enable": true, + "mode": "both" + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: texture + depth" + } + ] + } + ], + "atlas": { + "enable": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_clk_muxed.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer_fast.json b/cfg/xlayer/texture_depth_2layer_fast.json new file mode 100644 index 0000000000..081c072d49 --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer_fast.json @@ -0,0 +1,70 @@ +{ + "comment": "Fast texture + depth config for debugging. 
Disables expensive coding tools via codec_controls to minimize encode time.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 1920, + "height": 1080, + "layer_type": "texture", + "qp": 128, + "cpu_used": 9, + "lag_in_frames": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_keyframe_filtering", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + }, + { + "xlayer_id": 1, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "qp": 160, + "cpu_used": 9, + "lag_in_frames": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_keyframe_filtering", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0], + ["enable_intrabc", 0], + ["enable_palette", 0], + ["enable_interintra_comp", 0] + ] + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0] }, + { "intent": 1, "xlayer_map": [0, 1] } + ] + } + ], + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_fast.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer_local_only.json b/cfg/xlayer/texture_depth_2layer_local_only.json new file mode 100644 index 0000000000..7d6178d44d --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer_local_only.json @@ -0,0 +1,73 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + 
"qp": 128, + "cpu_used": 5 + }, + { + "xlayer_id": 1, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5 + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "local_lcr": { + "enable": true, + "mode": "local_only" + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: texture + depth" + } + ] + } + ], + "atlas": { + "enable": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_local_only_muxed.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer_open_leading.json b/cfg/xlayer/texture_depth_2layer_open_leading.json new file mode 100644 index 0000000000..a11e612885 --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer_open_leading.json @@ -0,0 +1,77 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "kf_max_dist": 16, + "gop_mode": "open_leading" + }, + { + "xlayer_id": 1, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "kf_max_dist": 16, + "gop_mode": "open_leading" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + 
"doh_constraint": true + }, + "local_lcr": { + "enable": true, + "mode": "both" + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: texture + depth" + } + ] + } + ], + "atlas": { + "enable": false + }, + "combined_tu": true, + "monotonic_output_order": false, + "output": "texture_depth_open_leading_muxed.obu" +} diff --git a/cfg/xlayer/texture_depth_2layer_open_sef.json b/cfg/xlayer/texture_depth_2layer_open_sef.json new file mode 100644 index 0000000000..b9a1914df8 --- /dev/null +++ b/cfg/xlayer/texture_depth_2layer_open_sef.json @@ -0,0 +1,77 @@ +{ + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "texture", + "view_type": "center", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 128, + "cpu_used": 5, + "kf_max_dist": 16, + "gop_mode": "open_sef" + }, + { + "xlayer_id": 1, + "input": "depth_1080p.y4m", + "width": 1920, + "height": 1080, + "profile": 3, + "level": 16, + "tier": 0, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_temporal_layers": 1, + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "kf_max_dist": 16, + "gop_mode": "open_sef" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true + }, + "local_lcr": { + "enable": true, + "mode": "local_only" + }, + "msdo": { + "enable": false + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { + "intent": 0, + "xlayer_map": [0], + "comment": "OP0: texture only" + }, + { + "intent": 1, + "xlayer_map": [0, 1], + "comment": "OP1: texture + depth" + } + ] + } + ], + "atlas": { + 
"enable": false + }, + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_open_sef_muxed.obu" +} diff --git a/cfg/xlayer/texture_depth_2xl_2ml_closed_mono.json b/cfg/xlayer/texture_depth_2xl_2ml_closed_mono.json new file mode 100644 index 0000000000..2cf9a099aa --- /dev/null +++ b/cfg/xlayer/texture_depth_2xl_2ml_closed_mono.json @@ -0,0 +1,55 @@ +{ + "comment": "2 xlayers × 2 mlayers, closed GOP, monotonic output. Texture has spatial scalability (1/2, 1:1). Depth is single-layer. Monotonic: hidden frames (ARF, INTNL) are output via SEF in display order.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture.y4m", + "width": 1920, + "height": 1080, + "layer_type": "texture", + "num_embedded_layers": 2, + "scaling_mode": ["1/2", "1:1"], + "qp": 128, + "cpu_used": 5, + "lag_in_frames": 19, + "gop_mode": "closed" + }, + { + "xlayer_id": 1, + "input": "depth.y4m", + "width": 1920, + "height": 1080, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "lag_in_frames": 19, + "gop_mode": "closed" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1], + "comment": "OP0: texture at 1/2 resolution" }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [2], + "comment": "OP1: texture at full resolution (both mlayers)" }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [2, 1], + "comment": "OP2: texture + depth" } + ] + } + ], + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_2xl_2ml_closed_mono.obu" +} diff --git a/cfg/xlayer/texture_depth_2xl_2ml_closed_nonmono.json b/cfg/xlayer/texture_depth_2xl_2ml_closed_nonmono.json new file mode 100644 index 0000000000..e43ffd7012 --- /dev/null +++ 
b/cfg/xlayer/texture_depth_2xl_2ml_closed_nonmono.json @@ -0,0 +1,55 @@ +{ + "comment": "2 xlayers × 2 mlayers, closed GOP, non-monotonic output. Texture has spatial scalability (1/2, 1:1). Depth is single-layer. Non-monotonic: ARF/INTNL frames are implicit output (decoder reorders).", + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture.y4m", + "width": 1920, + "height": 1080, + "layer_type": "texture", + "num_embedded_layers": 2, + "scaling_mode": ["1/2", "1:1"], + "qp": 128, + "cpu_used": 5, + "lag_in_frames": 19, + "gop_mode": "closed" + }, + { + "xlayer_id": 1, + "input": "depth.y4m", + "width": 1920, + "height": 1080, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "lag_in_frames": 19, + "gop_mode": "closed" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1], + "comment": "OP0: texture at 1/2 resolution" }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [2], + "comment": "OP1: texture at full resolution (both mlayers)" }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [2, 1], + "comment": "OP2: texture + depth" } + ] + } + ], + "combined_tu": true, + "monotonic_output_order": false, + "output": "texture_depth_2xl_2ml_closed_nonmono.obu" +} diff --git a/cfg/xlayer/texture_depth_2xl_2ml_open_leading.json b/cfg/xlayer/texture_depth_2xl_2ml_open_leading.json new file mode 100644 index 0000000000..eed1581ca7 --- /dev/null +++ b/cfg/xlayer/texture_depth_2xl_2ml_open_leading.json @@ -0,0 +1,57 @@ +{ + "comment": "2 xlayers × 2 mlayers, open GOP (OLK + leading pictures), non-monotonic. The forward keyframe is a displayed OLK — it is an implicit output frame (decoder reorders). 
Frames before the OLK in display order are coded as leading pictures after the OLK in coding order. Requires lag_in_frames > 0 and non-monotonic output.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture.y4m", + "width": 1920, + "height": 1080, + "layer_type": "texture", + "num_embedded_layers": 2, + "scaling_mode": ["1/2", "1:1"], + "qp": 128, + "cpu_used": 5, + "lag_in_frames": 19, + "kf_max_dist": 9, + "gop_mode": "open_leading" + }, + { + "xlayer_id": 1, + "input": "depth.y4m", + "width": 1920, + "height": 1080, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "lag_in_frames": 19, + "kf_max_dist": 9, + "gop_mode": "open_leading" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1], + "comment": "OP0: texture at 1/2 resolution" }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [2], + "comment": "OP1: texture at full resolution" }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [2, 1], + "comment": "OP2: texture + depth" } + ] + } + ], + "combined_tu": true, + "monotonic_output_order": false, + "output": "texture_depth_2xl_2ml_open_leading.obu" +} diff --git a/cfg/xlayer/texture_depth_2xl_2ml_open_sef_mono.json b/cfg/xlayer/texture_depth_2xl_2ml_open_sef_mono.json new file mode 100644 index 0000000000..62b247dfde --- /dev/null +++ b/cfg/xlayer/texture_depth_2xl_2ml_open_sef_mono.json @@ -0,0 +1,57 @@ +{ + "comment": "2 xlayers × 2 mlayers, open GOP (hidden INTRA_ONLY_FRAME + SEF), monotonic. The forward keyframe is coded as a hidden INTRA_ONLY_FRAME that does NOT reset reference buffers, enabling inter-prediction across the GOP boundary. The hidden frame is shown via SEF in display order. 
Requires lag_in_frames > 0.", + "xlayers": [ + { + "xlayer_id": 0, + "input": "texture.y4m", + "width": 1920, + "height": 1080, + "layer_type": "texture", + "num_embedded_layers": 2, + "scaling_mode": ["1/2", "1:1"], + "qp": 128, + "cpu_used": 5, + "lag_in_frames": 19, + "kf_max_dist": 9, + "gop_mode": "open_sef" + }, + { + "xlayer_id": 1, + "input": "depth.y4m", + "width": 1920, + "height": 1080, + "layer_type": "auxiliary", + "auxiliary_type": "depth", + "num_embedded_layers": 1, + "qp": 160, + "cpu_used": 5, + "lag_in_frames": 19, + "kf_max_dist": 9, + "gop_mode": "open_sef" + } + ], + "global_lcr": { + "enable": true, + "purpose_id": 0, + "doh_constraint": true + }, + "ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1], + "comment": "OP0: texture at 1/2 resolution" }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [2], + "comment": "OP1: texture at full resolution" }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [2, 1], + "comment": "OP2: texture + depth" } + ] + } + ], + "combined_tu": true, + "monotonic_output_order": true, + "output": "texture_depth_2xl_2ml_open_sef_mono.obu" +} diff --git a/cfg/xlayer/texture_depth_embedded_3ml_2ml.json b/cfg/xlayer/texture_depth_embedded_3ml_2ml.json new file mode 100644 index 0000000000..1a5d238c7a --- /dev/null +++ b/cfg/xlayer/texture_depth_embedded_3ml_2ml.json @@ -0,0 +1,46 @@ +{ + "inputs": [ + { "name": "texture", "filename": "texture.yuv", "width": 1920, "height": 1080 }, + { "name": "depth", "filename": "depth.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "texture", + "color_primaries": 1, "transfer_characteristics": 1, + "matrix_coefficients": 1, "full_range_flag": 0, + "num_embedded_layers": 3, + "embedded_layers": [ + { "scaling_mode": 
"1/4" }, + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "input_source": "texture", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "color_primaries": 1, "transfer_characteristics": 1, + "matrix_coefficients": 1, "full_range_flag": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "input_source": "texture", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "depth", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [], + "matrix_coefficients": 0, "full_range_flag": 1 } + ], + "qp": 160, "cpu_used": 5 } + ], + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "mlayer_info_idc": 2, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1] }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [3] }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [3, 2] } + ] + }], + "output": "texture_depth_embedded.obu" +} diff --git a/common/tu_assembler.c b/common/tu_assembler.c new file mode 100644 index 0000000000..5e6fdd36fd --- /dev/null +++ b/common/tu_assembler.c @@ -0,0 +1,1631 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. 
+ */ + +#include "common/tu_assembler.h" + +#include +#include +#include + +#include "avm/avm_codec.h" +#include "avm/avm_integer.h" +#include "avm/avmcx.h" +#include "avm_dsp/bitwriter_buffer.h" +#include "av2/common/level.h" +#include "av2/common/obu_util.h" + +// Ensure the buffer has room for 'needed' more bytes +static int ensure_capacity(TUAssembler *ta, size_t needed) { + size_t required = ta->size + needed; + if (required <= ta->capacity) return 0; + size_t new_cap = ta->capacity * 2; + if (new_cap < required) new_cap = required; + uint8_t *new_buf = (uint8_t *)realloc(ta->buffer, new_cap); + if (!new_buf) return -1; + ta->buffer = new_buf; + ta->capacity = new_cap; + return 0; +} + +// Append raw bytes to the assembler buffer +static int append_bytes(TUAssembler *ta, const uint8_t *data, size_t len) { + if (ensure_capacity(ta, len) != 0) return -1; + memcpy(ta->buffer + ta->size, data, len); + ta->size += len; + return 0; +} + +// Write a ULEB128-encoded size value to the buffer +static int append_uleb128(TUAssembler *ta, uint64_t value) { + uint8_t coded[10]; + size_t coded_size = 0; + if (avm_uleb_encode(value, sizeof(coded), coded, &coded_size) != 0) return -1; + return append_bytes(ta, coded, coded_size); +} + +// Parse a single OBU header byte +static void parse_obu_header_byte(uint8_t byte, ObuHeader *hdr) { + hdr->obu_header_extension_flag = (byte >> 7) & 1; + hdr->type = (OBU_TYPE)((byte >> 2) & 0x1F); + hdr->obu_tlayer_id = byte & 0x3; +} + +// Parse extension byte +static void parse_obu_ext_byte(uint8_t byte, ObuHeader *hdr) { + hdr->obu_mlayer_id = (byte >> 5) & 0x7; + hdr->obu_xlayer_id = byte & 0x1F; +} + +// Write a 2-byte OBU header with extension (sets xlayer_id) +static void write_obu_header_with_xlayer(uint8_t *dst, const ObuHeader *hdr, + int xlayer_id) { + // Byte 0: extension_flag=1, type, tlayer_id + dst[0] = (uint8_t)((1 << 7) | ((hdr->type & 0x1F) << 2) | + (hdr->obu_tlayer_id & 0x3)); + // Byte 1: mlayer_id, xlayer_id + dst[1] = 
(uint8_t)(((hdr->obu_mlayer_id & 0x7) << 5) | (xlayer_id & 0x1F)); +} + +int tu_assembler_init(TUAssembler *ta, const MultiXLayerConfig *mcfg) { + memset(ta, 0, sizeof(*ta)); + ta->capacity = TU_ASM_INITIAL_CAPACITY; + ta->buffer = (uint8_t *)malloc(ta->capacity); + if (!ta->buffer) return -1; + + ta->num_xlayers = mcfg->num_xlayers; + for (int i = 0; i < mcfg->num_xlayers; i++) { + ta->xlayer_ids[i] = mcfg->xlayers[i].xlayer_id; + } + + ta->msdo_enabled = mcfg->enable_msdo; + ta->num_ops_sets = mcfg->num_ops_sets; + ta->config = mcfg; + + // Populate Global LCR from config + if (mcfg->enable_global_lcr || mcfg->enable_local_lcr) { + populate_global_lcr_from_config(mcfg, &ta->global_lcr); + // In local_only mode, the Global LCR is present for stream detection + // and PTL but does not carry per-xlayer payload — Local LCRs are + // the authoritative source. + if (mcfg->enable_local_lcr && mcfg->local_lcr_mode == 1) { + ta->global_lcr.lcr_global_payload_present_flag = 0; + } + } + + // Populate OPS from config + for (int s = 0; s < mcfg->num_ops_sets; s++) { + populate_ops_from_config(&mcfg->ops_sets[s], GLOBAL_XLAYER_ID, mcfg, + &ta->ops_list[s]); + } + + // Populate Atlas from config + if (mcfg->enable_atlas) { + populate_atlas_from_config(mcfg, &ta->atlas_info); + } + + return 0; +} + +void tu_assembler_free(TUAssembler *ta) { + if (ta->buffer) { + free(ta->buffer); + ta->buffer = NULL; + } + ta->size = 0; + ta->capacity = 0; +} + +int tu_assembler_write_td(TUAssembler *ta) { + // Write a minimal 1-byte TD OBU: [size=1][header_byte] + // TD type = OBU_TEMPORAL_DELIMITER = 2 + // Header: ext=0, type=2, tlayer=0 => (2 << 2) = 0x08 + uint8_t td[2]; + td[0] = 1; // ULEB128 size = 1 (just the header byte) + td[1] = 0x08; // OBU_TEMPORAL_DELIMITER << 2 + return append_bytes(ta, td, 2); +} + +// Helper: write per-xlayer info (mirrors write_lcr_xlayer_info in +// bitstream_lcr.c) +static void tu_asm_write_lcr_xlayer_info(LCRXLayerInfo *xinfo, + int atlas_id_present, + 
struct avm_write_bit_buffer *wb) { + avm_wb_write_bit(wb, xinfo->lcr_rep_info_present_flag); + avm_wb_write_bit(wb, xinfo->lcr_xlayer_purpose_present_flag); + avm_wb_write_bit(wb, xinfo->lcr_xlayer_color_info_present_flag); + avm_wb_write_bit(wb, xinfo->lcr_embedded_layer_info_present_flag); + + if (xinfo->lcr_rep_info_present_flag) { + avm_wb_write_uvlc(wb, xinfo->rep_params.lcr_max_pic_width); + avm_wb_write_uvlc(wb, xinfo->rep_params.lcr_max_pic_height); + avm_wb_write_bit(wb, xinfo->rep_params.lcr_format_info_present_flag); + avm_wb_write_bit(wb, xinfo->crop_win.crop_window_present_flag); + if (xinfo->rep_params.lcr_format_info_present_flag) { + avm_wb_write_uvlc(wb, xinfo->rep_params.lcr_bit_depth_idc); + avm_wb_write_uvlc(wb, xinfo->rep_params.lcr_chroma_format_idc); + } + if (xinfo->crop_win.crop_window_present_flag) { + avm_wb_write_uvlc(wb, xinfo->crop_win.crop_win_left_offset); + avm_wb_write_uvlc(wb, xinfo->crop_win.crop_win_right_offset); + avm_wb_write_uvlc(wb, xinfo->crop_win.crop_win_top_offset); + avm_wb_write_uvlc(wb, xinfo->crop_win.crop_win_bottom_offset); + } + } + + if (xinfo->lcr_xlayer_purpose_present_flag) + avm_wb_write_literal(wb, xinfo->lcr_xlayer_purpose_id, 7); + + if (xinfo->lcr_xlayer_color_info_present_flag) { + struct XLayerColorInfo *col = &xinfo->xlayer_col_params; + avm_wb_write_rice_golomb(wb, col->layer_color_description_idc, 2); + if (col->layer_color_description_idc == 0) { + avm_wb_write_literal(wb, col->layer_color_primaries, 8); + avm_wb_write_literal(wb, col->layer_transfer_characteristics, 8); + avm_wb_write_literal(wb, col->layer_matrix_coefficients, 8); + } + avm_wb_write_bit(wb, col->layer_full_range_flag); + } + + // Byte alignment after per-xlayer flags/info + avm_wb_write_literal(wb, 0, (8 - wb->bit_offset % 8) % 8); + + if (xinfo->lcr_embedded_layer_info_present_flag) { + struct EmbeddedLayerInfo *ml = &xinfo->mlayer_params; + avm_wb_write_literal(wb, ml->lcr_mlayer_map, MAX_NUM_MLAYERS); + for (int m = 0; m < 
MAX_NUM_MLAYERS; m++) { + if (ml->lcr_mlayer_map & (1 << m)) { + avm_wb_write_literal(wb, ml->lcr_tlayer_map[m], MAX_NUM_TLAYERS); + if (atlas_id_present) { + avm_wb_write_literal(wb, ml->lcr_layer_atlas_segment_id[m], 8); + avm_wb_write_literal(wb, ml->lcr_priority_order[m], 8); + avm_wb_write_literal(wb, ml->lcr_rendering_method[m], 8); + } + avm_wb_write_literal(wb, ml->lcr_layer_type[m], 8); + if (ml->lcr_layer_type[m] == AUX_LAYER) { + avm_wb_write_literal(wb, ml->lcr_auxiliary_type[m], 8); + } + avm_wb_write_literal(wb, ml->lcr_view_type[m], 8); + if (ml->lcr_view_type[m] == VIEW_EXPLICIT) { + avm_wb_write_literal(wb, ml->lcr_view_id[m], 8); + } + if (m > 0) { + avm_wb_write_literal(wb, ml->lcr_dependent_layer_map[m], m); + } + avm_wb_write_bit(wb, ml->lcr_same_sh_max_resolution_flag[m]); + if (!ml->lcr_same_sh_max_resolution_flag[m]) { + avm_wb_write_uvlc(wb, ml->lcr_max_expected_width[m]); + avm_wb_write_uvlc(wb, ml->lcr_max_expected_height[m]); + } + // Byte alignment per mlayer + int remaining = wb->bit_offset % 8; + if (remaining != 0) avm_wb_write_literal(wb, 0, 8 - remaining); + } + } + } else { + if (atlas_id_present) { + avm_wb_write_literal(wb, xinfo->lcr_xlayer_atlas_segment_id, 8); + avm_wb_write_literal(wb, xinfo->lcr_xlayer_priority_order, 8); + avm_wb_write_literal(wb, xinfo->lcr_xlayer_rendering_method, 8); + } + } +} + +// Calculate lcr_data_size for a single xlayer payload +static uint32_t tu_asm_calculate_lcr_data_size( + GlobalLayerConfigurationRecord *glcr, int i) { + uint8_t temp[2048]; + struct avm_write_bit_buffer wb = { temp, 0 }; + int n = glcr->LcrXLayerID[i]; + + if (glcr->lcr_dependent_xlayers_flag && n > 0) + avm_wb_write_unsigned_literal(&wb, glcr->lcr_num_dependent_xlayer_map[i], + n); + + tu_asm_write_lcr_xlayer_info(&glcr->xlayer_info[i], + glcr->lcr_global_atlas_id_present_flag, &wb); + + return (wb.bit_offset + 7) / 8; +} + +int tu_assembler_write_global_lcr(TUAssembler *ta) { + // Spec-compliant Global LCR OBU 
serialization matching + // write_lcr_global_info() in bitstream_lcr.c. + uint8_t lcr_buf[4096]; + struct avm_write_bit_buffer wb = { lcr_buf, 0 }; + GlobalLayerConfigurationRecord *glcr = &ta->global_lcr; + + // OBU header with extension (xlayer_id = GLOBAL_XLAYER_ID) + avm_wb_write_bit(&wb, 1); // extension flag + avm_wb_write_literal(&wb, OBU_LAYER_CONFIGURATION_RECORD, 5); + avm_wb_write_literal(&wb, 0, 2); // tlayer + avm_wb_write_literal(&wb, 0, 3); // mlayer + avm_wb_write_literal(&wb, GLOBAL_XLAYER_ID, 5); // xlayer + + // Global LCR payload — matches write_lcr_global_info() exactly + avm_wb_write_literal(&wb, glcr->lcr_global_config_record_id, 3); + avm_wb_write_literal(&wb, glcr->lcr_xlayer_map, 31); + avm_wb_write_bit(&wb, glcr->lcr_aggregate_info_present_flag); + avm_wb_write_bit(&wb, glcr->lcr_seq_profile_tier_level_info_present_flag); + avm_wb_write_bit(&wb, glcr->lcr_global_payload_present_flag); + avm_wb_write_bit(&wb, glcr->lcr_dependent_xlayers_flag); + avm_wb_write_bit(&wb, glcr->lcr_global_atlas_id_present_flag); + avm_wb_write_literal(&wb, glcr->lcr_global_purpose_id, 7); + avm_wb_write_bit(&wb, glcr->lcr_doh_constraint_flag); + avm_wb_write_bit(&wb, glcr->lcr_enforce_tile_alignment_flag); + if (glcr->lcr_global_atlas_id_present_flag) + avm_wb_write_literal(&wb, glcr->lcr_global_atlas_id, 3); + else + avm_wb_write_literal(&wb, 0, 3); // reserved + avm_wb_write_literal(&wb, 0, 5); // reserved + + if (glcr->lcr_aggregate_info_present_flag) { + avm_wb_write_literal(&wb, glcr->aggregate_ptl.lcr_config_idc, 6); + avm_wb_write_literal(&wb, glcr->aggregate_ptl.lcr_aggregate_level_idx, 5); + avm_wb_write_bit(&wb, glcr->aggregate_ptl.lcr_max_tier_flag); + avm_wb_write_literal(&wb, glcr->aggregate_ptl.lcr_max_interop, 4); + } + + if (glcr->lcr_seq_profile_tier_level_info_present_flag) { + for (int i = 0; i < glcr->LcrMaxNumXLayerCount; i++) { + avm_wb_write_literal(&wb, glcr->seq_ptl[i].lcr_seq_profile_idc, 5); + avm_wb_write_literal(&wb, 
glcr->seq_ptl[i].lcr_max_level_idx, 5); + avm_wb_write_bit(&wb, glcr->seq_ptl[i].lcr_tier_flag); + avm_wb_write_literal(&wb, glcr->seq_ptl[i].lcr_max_mlayer_count, 3); + avm_wb_write_literal(&wb, glcr->seq_ptl[i].lcr_reserved_2bits, 2); + } + } + + if (glcr->lcr_global_payload_present_flag) { + // Pre-calculate data sizes + for (int i = 0; i < glcr->LcrMaxNumXLayerCount; i++) { + glcr->lcr_data_size[i] = tu_asm_calculate_lcr_data_size(glcr, i); + } + for (int i = 0; i < glcr->LcrMaxNumXLayerCount; i++) { + avm_wb_write_uleb(&wb, glcr->lcr_data_size[i]); + // Write payload + const uint32_t start_position = wb.bit_offset; + int n = glcr->LcrXLayerID[i]; + if (glcr->lcr_dependent_xlayers_flag && n > 0) + avm_wb_write_unsigned_literal(&wb, + glcr->lcr_num_dependent_xlayer_map[i], n); + tu_asm_write_lcr_xlayer_info(&glcr->xlayer_info[i], + glcr->lcr_global_atlas_id_present_flag, &wb); + // Pad remaining bits to match lcr_data_size + const uint32_t parsed_bits = wb.bit_offset - start_position; + const int remaining = + (int)(glcr->lcr_data_size[i] * 8) - (int)parsed_bits; + for (int j = 0; j < remaining; j++) avm_wb_write_bit(&wb, 0); + } + } + + // Extension flag + trailing bits + avm_wb_write_bit(&wb, 0); // lcr_extension_present_flag + if (avm_wb_is_byte_aligned(&wb)) + avm_wb_write_literal(&wb, 0x80, 8); + else + avm_wb_write_bit(&wb, 1); + + uint32_t obu_payload_size = avm_wb_bytes_written(&wb); + + // Write: [uleb128 total size][obu data] + if (append_uleb128(ta, (uint64_t)obu_payload_size) != 0) return -1; + return append_bytes(ta, lcr_buf, obu_payload_size); +} + +int tu_assembler_write_local_lcr(TUAssembler *ta, int xlayer_idx) { + // Spec-compliant Local LCR OBU serialization matching + // write_lcr_local_info() in bitstream_lcr.c. 
+ GlobalLayerConfigurationRecord *glcr = &ta->global_lcr; + + if (xlayer_idx < 0 || xlayer_idx >= glcr->LcrMaxNumXLayerCount) return -1; + + int xlayer_id = glcr->LcrXLayerID[xlayer_idx]; + uint8_t lcr_buf[4096]; + struct avm_write_bit_buffer wb = { lcr_buf, 0 }; + + // OBU header with extension (xlayer_id = per-xlayer, NOT global) + avm_wb_write_bit(&wb, 1); // extension flag + avm_wb_write_literal(&wb, OBU_LAYER_CONFIGURATION_RECORD, 5); + avm_wb_write_literal(&wb, 0, 2); // tlayer + avm_wb_write_literal(&wb, 0, 3); // mlayer + avm_wb_write_literal(&wb, xlayer_id & 0x1F, 5); // xlayer + + // Local LCR payload — matches write_lcr_local_info() + avm_wb_write_literal(&wb, glcr->lcr_global_config_record_id, + 3); // lcr_global_id + avm_wb_write_literal(&wb, 1, 3); // lcr_local_id (matches encoder.c:938) + avm_wb_write_bit(&wb, 1); // lcr_profile_tier_level_info_present_flag + avm_wb_write_bit(&wb, 0); // lcr_local_atlas_id_present_flag + + // PTL — reuse same data as Global LCR seq_ptl for this xlayer + avm_wb_write_literal(&wb, glcr->seq_ptl[xlayer_idx].lcr_seq_profile_idc, 5); + avm_wb_write_literal(&wb, glcr->seq_ptl[xlayer_idx].lcr_max_level_idx, 5); + avm_wb_write_bit(&wb, glcr->seq_ptl[xlayer_idx].lcr_tier_flag); + avm_wb_write_literal(&wb, glcr->seq_ptl[xlayer_idx].lcr_max_mlayer_count, 3); + avm_wb_write_literal(&wb, 0, 2); // lcr_reserved_2bits + + // Reserved bits (atlas_id not present) + avm_wb_write_literal(&wb, 0, 3); // lcr_reserved_zero_3bits + avm_wb_write_literal(&wb, 0, 5); // lcr_reserved_zero_5bits + + // xlayer_info — identical data to Global LCR to pass decoder validation + tu_asm_write_lcr_xlayer_info(&glcr->xlayer_info[xlayer_idx], 0, &wb); + + // Extension flag + trailing bits + avm_wb_write_bit(&wb, 0); // lcr_extension_present_flag + if (avm_wb_is_byte_aligned(&wb)) + avm_wb_write_literal(&wb, 0x80, 8); + else + avm_wb_write_bit(&wb, 1); + + uint32_t obu_payload_size = avm_wb_bytes_written(&wb); + + if (append_uleb128(ta, 
(uint64_t)obu_payload_size) != 0) return -1; + return append_bytes(ta, lcr_buf, obu_payload_size); +} + +int tu_assembler_write_msdo(TUAssembler *ta) { + if (!ta->msdo_enabled) return 0; + + // Write MSDO OBU — ported from stream_multiplexer.cc + uint8_t msdo_buf[128]; + struct avm_write_bit_buffer wb = { msdo_buf, 0 }; + + // OBU header with extension (xlayer_id = GLOBAL_XLAYER_ID) + avm_wb_write_bit(&wb, 1); // extension flag + avm_wb_write_literal(&wb, OBU_MULTI_STREAM_DECODER_OPERATION, 5); + avm_wb_write_literal(&wb, 0, 2); // tlayer + avm_wb_write_literal(&wb, 0, 3); // mlayer + avm_wb_write_literal(&wb, GLOBAL_XLAYER_ID, 5); // xlayer + + // MSDO payload + avm_wb_write_literal(&wb, ta->num_xlayers - 2, 3); // num_streams - 2 + avm_wb_write_literal(&wb, MAIN_420_10_IP1, PROFILE_BITS); + avm_wb_write_literal(&wb, SEQ_LEVEL_4_0, LEVEL_BITS); + avm_wb_write_bit(&wb, 0); // tier + + // Even allocation flag + avm_wb_write_bit(&wb, 1); // multistream_even_allocation_flag + + // Per-stream info + for (int i = 0; i < ta->num_xlayers; i++) { + avm_wb_write_literal(&wb, ta->xlayer_ids[i], XLAYER_BITS); + avm_wb_write_literal(&wb, 0, PROFILE_BITS); + avm_wb_write_literal(&wb, SEQ_LEVEL_4_0, LEVEL_BITS); + avm_wb_write_bit(&wb, 0); // tier + } + + // doh_constraint_flag + avm_wb_write_bit(&wb, ta->config->lcr_doh_constraint_flag); + + // Trailing bit + if ((wb.bit_offset % 8) == 0) { + avm_wb_write_literal(&wb, 0x80, 8); + } else { + avm_wb_write_bit(&wb, 1); + while ((wb.bit_offset % 8) != 0) avm_wb_write_bit(&wb, 0); + } + + uint32_t obu_size = avm_wb_bytes_written(&wb); + + if (append_uleb128(ta, (uint64_t)obu_size) != 0) return -1; + return append_bytes(ta, msdo_buf, obu_size); +} + +// Compute ops_data_size for a single operating point. +// Mirrors calculate_ops_data_size() in bitstream_ops.c. 
+static uint32_t tu_asm_calculate_ops_data_size(const OperatingPointSet *ops, + int obu_xlayer_id, + int op_index) { + uint8_t temp_buffer[1024]; + struct avm_write_bit_buffer temp_wb = { temp_buffer, 0 }; + const OperatingPoint *op = &ops->op[op_index]; + + if (ops->ops_intent_present_flag) + avm_wb_write_literal(&temp_wb, op->ops_intent_op, 7); + + if (ops->ops_ptl_present_flag) { + if (obu_xlayer_id == GLOBAL_XLAYER_ID) { + avm_wb_write_literal(&temp_wb, op->ops_config_idc, MULTI_SEQ_CONFIG_BITS); + avm_wb_write_literal(&temp_wb, op->ops_aggregate_level_idx, LEVEL_BITS); + avm_wb_write_bit(&temp_wb, op->ops_max_tier_flag); + avm_wb_write_literal(&temp_wb, op->ops_max_interop, INTEROP_BITS); + } else { + avm_wb_write_literal(&temp_wb, op->ops_seq_profile_idc[obu_xlayer_id], + PROFILE_BITS); + avm_wb_write_literal(&temp_wb, op->ops_level_idx[obu_xlayer_id], + LEVEL_BITS); + avm_wb_write_bit(&temp_wb, op->ops_tier_flag[obu_xlayer_id]); + avm_wb_write_literal(&temp_wb, op->ops_mlayer_count[obu_xlayer_id], 3); + avm_wb_write_literal(&temp_wb, 0, 2); + } + } + + if (ops->ops_color_info_present_flag) { + // Simplified: write ops_color_description_idc=1 (unspecified, no payload) + avm_wb_write_rice_golomb(&temp_wb, op->color_info.ops_color_description_idc, + 2); + if (op->color_info.ops_color_description_idc == 0) { + avm_wb_write_literal(&temp_wb, op->color_info.ops_color_primaries, 8); + avm_wb_write_literal(&temp_wb, + op->color_info.ops_transfer_characteristics, 8); + avm_wb_write_literal(&temp_wb, op->color_info.ops_matrix_coefficients, 8); + } + avm_wb_write_bit(&temp_wb, op->color_info.ops_full_range_flag); + } + + avm_wb_write_bit(&temp_wb, + op->ops_decoder_model_info_for_this_op_present_flag); + + int ops_initial_display_delay_present_flag = + op->ops_initial_display_delay != BUFFER_POOL_MAX_SIZE; + avm_wb_write_bit(&temp_wb, ops_initial_display_delay_present_flag); + if (ops_initial_display_delay_present_flag) { + avm_wb_write_literal(&temp_wb, 
op->ops_initial_display_delay - 1, 4); + } + + if (obu_xlayer_id == GLOBAL_XLAYER_ID) { + avm_wb_write_literal(&temp_wb, op->ops_xlayer_map, MAX_NUM_XLAYERS - 1); + for (int j = 0; j < MAX_NUM_XLAYERS - 1; j++) { + if (op->ops_xlayer_map & (1 << j)) { + if (ops->ops_ptl_present_flag) { + avm_wb_write_literal(&temp_wb, op->ops_seq_profile_idc[j], + PROFILE_BITS); + avm_wb_write_literal(&temp_wb, op->ops_level_idx[j], LEVEL_BITS); + avm_wb_write_bit(&temp_wb, op->ops_tier_flag[j]); + avm_wb_write_literal(&temp_wb, op->ops_mlayer_count[j], 3); + avm_wb_write_literal(&temp_wb, 0, 2); + } + if (ops->ops_mlayer_info_idc == 1) { + avm_wb_write_literal(&temp_wb, op->mlayer_info.ops_mlayer_map[j], + MAX_NUM_MLAYERS); + for (int m = 0; m < 8; m++) { + if (op->mlayer_info.ops_mlayer_map[j] & (1 << m)) { + avm_wb_write_literal(&temp_wb, + op->mlayer_info.ops_tlayer_map[j][m], + MAX_NUM_TLAYERS); + } + } + } else if (ops->ops_mlayer_info_idc == 2) { + avm_wb_write_bit(&temp_wb, op->ops_mlayer_explicit_info_flag[j]); + if (op->ops_mlayer_explicit_info_flag[j]) { + avm_wb_write_literal(&temp_wb, op->mlayer_info.ops_mlayer_map[j], + MAX_NUM_MLAYERS); + for (int m = 0; m < 8; m++) { + if (op->mlayer_info.ops_mlayer_map[j] & (1 << m)) { + avm_wb_write_literal(&temp_wb, + op->mlayer_info.ops_tlayer_map[j][m], + MAX_NUM_TLAYERS); + } + } + } else { + avm_wb_write_literal(&temp_wb, op->ops_embedded_ops_id[j], 4); + avm_wb_write_literal(&temp_wb, op->ops_embedded_op_index[j], 3); + } + } + } + } + } else { + avm_wb_write_literal(&temp_wb, + op->mlayer_info.ops_mlayer_map[obu_xlayer_id], + MAX_NUM_MLAYERS); + for (int m = 0; m < 8; m++) { + if (op->mlayer_info.ops_mlayer_map[obu_xlayer_id] & (1 << m)) { + avm_wb_write_literal(&temp_wb, + op->mlayer_info.ops_tlayer_map[obu_xlayer_id][m], + MAX_NUM_TLAYERS); + } + } + } + + // Byte alignment + avm_wb_write_literal(&temp_wb, 0, (8 - temp_wb.bit_offset % 8) % 8); + return (temp_wb.bit_offset + 7) / 8; +} + +int 
tu_assembler_write_ops(TUAssembler *ta, int xlayer_id) { + // Spec-compliant OPS OBU serialization matching + // av2_write_operating_point_set_obu() in bitstream_ops.c. + + for (int s = 0; s < ta->num_ops_sets; s++) { + const OperatingPointSet *ops = &ta->ops_list[s]; + if (!ops->valid) continue; + + int obu_xlayer_id = (xlayer_id >= 0) ? xlayer_id : ops->obu_xlayer_id; + + uint8_t ops_buf[2048]; + struct avm_write_bit_buffer wb = { ops_buf, 0 }; + + // OBU header with extension + avm_wb_write_bit(&wb, 1); // extension flag + avm_wb_write_literal(&wb, OBU_OPERATING_POINT_SET, 5); + avm_wb_write_literal(&wb, 0, 2); // tlayer + avm_wb_write_literal(&wb, 0, 3); // mlayer + avm_wb_write_literal(&wb, obu_xlayer_id & 0x1F, 5); // xlayer + + // OPS payload — mirrors av2_write_operating_point_set_obu() + avm_wb_write_bit(&wb, ops->ops_reset_flag); + avm_wb_write_literal(&wb, ops->ops_id, OPS_ID_BITS); + avm_wb_write_literal(&wb, ops->ops_cnt, OPS_COUNT_BITS); + + if (ops->ops_cnt > 0) { + avm_wb_write_literal(&wb, ops->ops_priority, 4); + avm_wb_write_literal(&wb, ops->ops_intent, 7); + avm_wb_write_bit(&wb, ops->ops_intent_present_flag); + avm_wb_write_bit(&wb, ops->ops_ptl_present_flag); + avm_wb_write_bit(&wb, ops->ops_color_info_present_flag); + if (obu_xlayer_id == GLOBAL_XLAYER_ID) { + avm_wb_write_literal(&wb, ops->ops_mlayer_info_idc, 2); + } else { + avm_wb_write_literal(&wb, 0, 2); + } + } + + for (int p = 0; p < ops->ops_cnt; p++) { + OperatingPoint *op = (OperatingPoint *)&ops->op[p]; + + // Calculate and write ops_data_size + uint32_t data_size = + tu_asm_calculate_ops_data_size(ops, obu_xlayer_id, p); + avm_wb_write_uleb(&wb, data_size); + + if (ops->ops_intent_present_flag) + avm_wb_write_literal(&wb, op->ops_intent_op, 7); + + if (ops->ops_ptl_present_flag) { + if (obu_xlayer_id == GLOBAL_XLAYER_ID) { + avm_wb_write_literal(&wb, op->ops_config_idc, MULTI_SEQ_CONFIG_BITS); + avm_wb_write_literal(&wb, op->ops_aggregate_level_idx, LEVEL_BITS); + 
avm_wb_write_bit(&wb, op->ops_max_tier_flag); + avm_wb_write_literal(&wb, op->ops_max_interop, INTEROP_BITS); + } else { + avm_wb_write_literal(&wb, op->ops_seq_profile_idc[obu_xlayer_id], + PROFILE_BITS); + avm_wb_write_literal(&wb, op->ops_level_idx[obu_xlayer_id], + LEVEL_BITS); + avm_wb_write_bit(&wb, op->ops_tier_flag[obu_xlayer_id]); + avm_wb_write_literal(&wb, op->ops_mlayer_count[obu_xlayer_id], 3); + avm_wb_write_literal(&wb, 0, 2); + } + } + + if (ops->ops_color_info_present_flag) { + avm_wb_write_rice_golomb(&wb, op->color_info.ops_color_description_idc, + 2); + if (op->color_info.ops_color_description_idc == 0) { + avm_wb_write_literal(&wb, op->color_info.ops_color_primaries, 8); + avm_wb_write_literal(&wb, op->color_info.ops_transfer_characteristics, + 8); + avm_wb_write_literal(&wb, op->color_info.ops_matrix_coefficients, 8); + } + avm_wb_write_bit(&wb, op->color_info.ops_full_range_flag); + } + + avm_wb_write_bit(&wb, + op->ops_decoder_model_info_for_this_op_present_flag); + + int ops_initial_display_delay_present_flag = + op->ops_initial_display_delay != BUFFER_POOL_MAX_SIZE; + avm_wb_write_bit(&wb, ops_initial_display_delay_present_flag); + if (ops_initial_display_delay_present_flag) { + avm_wb_write_literal(&wb, op->ops_initial_display_delay - 1, 4); + } + + if (obu_xlayer_id == GLOBAL_XLAYER_ID) { + avm_wb_write_literal(&wb, op->ops_xlayer_map, MAX_NUM_XLAYERS - 1); + for (int j = 0; j < MAX_NUM_XLAYERS - 1; j++) { + if (op->ops_xlayer_map & (1 << j)) { + if (ops->ops_ptl_present_flag) { + avm_wb_write_literal(&wb, op->ops_seq_profile_idc[j], + PROFILE_BITS); + avm_wb_write_literal(&wb, op->ops_level_idx[j], LEVEL_BITS); + avm_wb_write_bit(&wb, op->ops_tier_flag[j]); + avm_wb_write_literal(&wb, op->ops_mlayer_count[j], 3); + avm_wb_write_literal(&wb, 0, 2); + } + if (ops->ops_mlayer_info_idc == 1) { + avm_wb_write_literal(&wb, op->mlayer_info.ops_mlayer_map[j], + MAX_NUM_MLAYERS); + for (int m = 0; m < 8; m++) { + if 
(op->mlayer_info.ops_mlayer_map[j] & (1 << m)) { + avm_wb_write_literal(&wb, + op->mlayer_info.ops_tlayer_map[j][m], + MAX_NUM_TLAYERS); + } + } + } else if (ops->ops_mlayer_info_idc == 2) { + avm_wb_write_bit(&wb, op->ops_mlayer_explicit_info_flag[j]); + if (op->ops_mlayer_explicit_info_flag[j]) { + avm_wb_write_literal(&wb, op->mlayer_info.ops_mlayer_map[j], + MAX_NUM_MLAYERS); + for (int m = 0; m < 8; m++) { + if (op->mlayer_info.ops_mlayer_map[j] & (1 << m)) { + avm_wb_write_literal(&wb, + op->mlayer_info.ops_tlayer_map[j][m], + MAX_NUM_TLAYERS); + } + } + } else { + avm_wb_write_literal(&wb, op->ops_embedded_ops_id[j], 4); + avm_wb_write_literal(&wb, op->ops_embedded_op_index[j], 3); + } + } + } + } + } else { + avm_wb_write_literal(&wb, op->mlayer_info.ops_mlayer_map[obu_xlayer_id], + MAX_NUM_MLAYERS); + for (int m = 0; m < 8; m++) { + if (op->mlayer_info.ops_mlayer_map[obu_xlayer_id] & (1 << m)) { + avm_wb_write_literal( + &wb, op->mlayer_info.ops_tlayer_map[obu_xlayer_id][m], + MAX_NUM_TLAYERS); + } + } + } + + // Byte alignment at end of each operating point + avm_wb_write_literal(&wb, 0, (8 - wb.bit_offset % 8) % 8); + } + + // Extension flag + avm_wb_write_bit(&wb, 0); + + // Trailing bits + if (avm_wb_is_byte_aligned(&wb)) { + avm_wb_write_literal(&wb, 0x80, 8); + } else { + avm_wb_write_bit(&wb, 1); + } + + uint32_t obu_size = avm_wb_bytes_written(&wb); + if (append_uleb128(ta, (uint64_t)obu_size) != 0) return -1; + if (append_bytes(ta, ops_buf, obu_size) != 0) return -1; + } + + return 0; +} + +int tu_assembler_write_atlas(TUAssembler *ta) { + if (!ta->config->enable_atlas) return 0; + + AtlasSegmentInfo *atlas = &ta->atlas_info; + if (!atlas->valid) return 0; + + uint8_t atlas_buf[4096]; + struct avm_write_bit_buffer wb = { atlas_buf, 0 }; + + // OBU header with extension (xlayer_id = GLOBAL_XLAYER_ID) + avm_wb_write_bit(&wb, 1); // extension flag + avm_wb_write_literal(&wb, OBU_ATLAS_SEGMENT, 5); + avm_wb_write_literal(&wb, 0, 2); // tlayer + 
avm_wb_write_literal(&wb, 0, 3); // mlayer + avm_wb_write_literal(&wb, GLOBAL_XLAYER_ID, 5); // xlayer + + // Atlas payload — mirrors av2_write_atlas_segment_info_obu() + avm_wb_write_literal(&wb, atlas->atlas_segment_id, 3); + avm_wb_write_uvlc(&wb, atlas->atlas_segment_mode_idc); + + int num_segments = 0; + if (atlas->atlas_segment_mode_idc == ENHANCED_ATLAS) { + // Write region info + struct AtlasRegionInfo *reg = &atlas->ats_reg_params; + avm_wb_write_uvlc(&wb, reg->ats_num_region_columns_minus_1); + avm_wb_write_uvlc(&wb, reg->ats_num_region_rows_minus_1); + avm_wb_write_bit(&wb, reg->ats_uniform_spacing_flag); + if (!reg->ats_uniform_spacing_flag) { + for (int i = 0; i <= reg->ats_num_region_columns_minus_1; i++) + avm_wb_write_uvlc(&wb, reg->ats_column_width_minus_1[i]); + for (int i = 0; i <= reg->ats_num_region_rows_minus_1; i++) + avm_wb_write_uvlc(&wb, reg->ats_row_height_minus_1[i]); + } else { + avm_wb_write_uvlc(&wb, reg->ats_region_width_minus_1); + avm_wb_write_uvlc(&wb, reg->ats_region_height_minus_1); + } + + // Write region to segment mapping + struct AtlasRegionToSegmentMapping *map = &atlas->ats_reg_seg_map; + avm_wb_write_bit(&wb, map->ats_single_region_per_atlas_segment_flag); + if (!map->ats_single_region_per_atlas_segment_flag) { + avm_wb_write_uvlc(&wb, map->ats_num_atlas_segments_minus_1); + int ns = map->ats_num_atlas_segments_minus_1 + 1; + for (int i = 0; i < ns; i++) { + avm_wb_write_uvlc(&wb, map->ats_top_left_region_column[i]); + avm_wb_write_uvlc(&wb, map->ats_top_left_region_row[i]); + avm_wb_write_uvlc(&wb, map->ats_bottom_right_region_column_offset[i]); + avm_wb_write_uvlc(&wb, map->ats_bottom_right_region_row_offset[i]); + } + num_segments = ns; + } else { + num_segments = reg->NumRegionsInAtlas; + map->ats_num_atlas_segments_minus_1 = num_segments - 1; + } + } else if (atlas->atlas_segment_mode_idc == MULTISTREAM_ATLAS) { + // Write basic info for multistream + struct AtlasBasicInfo *basic = &atlas->ats_basic_info_s; + 
avm_wb_write_bit(&wb, basic->ats_stream_id_present); + avm_wb_write_uvlc(&wb, basic->ats_atlas_width); + avm_wb_write_uvlc(&wb, basic->ats_atlas_height); + avm_wb_write_uvlc(&wb, basic->ats_num_atlas_segments_minus_1); + + int ns = basic->ats_num_atlas_segments_minus_1 + 1; + for (int i = 0; i < ns; i++) { + if (basic->ats_stream_id_present) + avm_wb_write_literal(&wb, basic->ats_input_stream_id[i], 5); + avm_wb_write_uvlc(&wb, basic->ats_segment_top_left_pos_x[i]); + avm_wb_write_uvlc(&wb, basic->ats_segment_top_left_pos_y[i]); + avm_wb_write_uvlc(&wb, basic->ats_segment_width[i]); + avm_wb_write_uvlc(&wb, basic->ats_segment_height[i]); + } + num_segments = ns; + } + + // Label segment info + avm_wb_write_bit(&wb, + atlas->ats_label_seg.ats_signalled_atlas_segment_ids_flag); + if (atlas->ats_label_seg.ats_signalled_atlas_segment_ids_flag) { + for (int i = 0; i < num_segments; i++) { + avm_wb_write_literal(&wb, atlas->ats_label_seg.ats_atlas_segment_id[i], + ATLAS_LABEL_SEG_ID_BITS); + } + } + + // Extension + trailing bits + avm_wb_write_bit(&wb, 0); // ats_extension_present_flag + if (avm_wb_is_byte_aligned(&wb)) + avm_wb_write_literal(&wb, 0x80, 8); + else + avm_wb_write_bit(&wb, 1); + + uint32_t obu_size = avm_wb_bytes_written(&wb); + if (append_uleb128(ta, (uint64_t)obu_size) != 0) return -1; + return append_bytes(ta, atlas_buf, obu_size); +} + +int tu_assembler_append_xlayer_obus(TUAssembler *ta, int xlayer_id, + const uint8_t *data, size_t size) { + // Parse OBUs from per-xlayer encoder output and rewrite headers + // with the specified xlayer_id. Skip TDs and structural OBUs + // (the assembler writes those globally). 
+ size_t consumed = 0; + + while (consumed < size) { + size_t remaining = size - consumed; + size_t length_field_size = 0; + uint64_t obu_total_size = 0; + + // Read OBU total size (ULEB128) + if (avm_uleb_decode(data + consumed, remaining, &obu_total_size, + &length_field_size) != 0) { + fprintf(stderr, "OBU size parsing failed at offset %zu\n", consumed); + return -1; + } + + if (obu_total_size == 0 || + consumed + length_field_size + obu_total_size > size) { + break; + } + + // Parse OBU header + const uint8_t *obu_start = data + consumed + length_field_size; + ObuHeader hdr; + memset(&hdr, 0, sizeof(hdr)); + parse_obu_header_byte(obu_start[0], &hdr); + + int obu_header_size = 1; + if (hdr.obu_header_extension_flag) { + parse_obu_ext_byte(obu_start[1], &hdr); + obu_header_size = 2; + } + + consumed += length_field_size + (size_t)obu_total_size; + + // Skip TD OBUs — the assembler writes a single global TD + if (hdr.type == OBU_TEMPORAL_DELIMITER) continue; + + // Skip structural OBUs — the assembler generates global versions + if (hdr.type == OBU_MULTI_STREAM_DECODER_OPERATION) continue; + if (hdr.type == OBU_LAYER_CONFIGURATION_RECORD) continue; + if (hdr.type == OBU_OPERATING_POINT_SET) continue; + if (hdr.type == OBU_ATLAS_SEGMENT) continue; + + // Rewrite OBU header with xlayer_id and recalculate size + // New header is always 2 bytes (extension flag set) + uint8_t new_header[2]; + write_obu_header_with_xlayer(new_header, &hdr, xlayer_id); + + // Payload is everything after the original header + const uint8_t *payload = obu_start + obu_header_size; + size_t payload_size = (size_t)obu_total_size - (size_t)obu_header_size; + + // New OBU total size = 2 (header) + payload_size + uint64_t new_obu_total_size = 2 + payload_size; + + // Write: [uleb128 new total size][2-byte header][payload] + if (append_uleb128(ta, new_obu_total_size) != 0) return -1; + if (append_bytes(ta, new_header, 2) != 0) return -1; + if (payload_size > 0) { + if (append_bytes(ta, payload, 
payload_size) != 0) return -1; + } + } + + return 0; +} + +int tu_assembler_flush(TUAssembler *ta, FILE *outfile) { + if (ta->size == 0) return 0; + size_t written = fwrite(ta->buffer, 1, ta->size, outfile); + if (written != ta->size) { + fprintf(stderr, "Error: failed to write TU (%zu of %zu bytes)\n", written, + ta->size); + return -1; + } + ta->size = 0; + return 0; +} + +// Write structural OBUs (LCR, OPS, Atlas) into the assembler buffer. +// Called at the start of a TU when first_output or a keyframe is present. +// OBU order per spec: Global config (MSDO, Global LCR, Global OPS, Global +// Atlas) then per-xlayer data with Local LCR preceding each xlayer's OBUs. +// Local LCRs are NOT emitted here; they are emitted per-xlayer in the caller. +void tu_assembler_write_structural_obus(TUAssembler *ta, + const MultiXLayerConfig *mcfg, + int *first_output, int has_keyframe) { + if (*first_output || has_keyframe) { + *first_output = 0; + if (mcfg->enable_msdo) tu_assembler_write_msdo(ta); + if (mcfg->enable_global_lcr || mcfg->enable_local_lcr) + tu_assembler_write_global_lcr(ta); + tu_assembler_write_ops(ta, GLOBAL_XLAYER_ID); + if (mcfg->enable_atlas) tu_assembler_write_atlas(ta); + } +} + +// Rewrite an OBU's header with a new xlayer_id and append it to the assembler. +// obu_start points to the OBU data, obu_size is the total OBU size (header + +// payload), and obu_header_size is 1 or 2 bytes. 
+static void rewrite_and_append_obu(TUAssembler *ta, const uint8_t *obu_start, + size_t obu_size, int obu_header_size, + int xlayer_id) { + ObuHeader hdr; + memset(&hdr, 0, sizeof(hdr)); + parse_obu_header_byte(obu_start[0], &hdr); + if (hdr.obu_header_extension_flag) parse_obu_ext_byte(obu_start[1], &hdr); + uint8_t new_header[2]; + write_obu_header_with_xlayer(new_header, &hdr, xlayer_id); + const uint8_t *payload = obu_start + obu_header_size; + size_t payload_size = obu_size - (size_t)obu_header_size; + uint64_t new_obu_total_size = 2 + payload_size; + append_uleb128(ta, new_obu_total_size); + append_bytes(ta, new_header, 2); + if (payload_size > 0) append_bytes(ta, payload, payload_size); +} + +int tu_assembler_write_split_tus(TUAssembler *ta, const MultiXLayerConfig *mcfg, + int xlayer_id, const uint8_t *data, + size_t size, int *first_output, + FILE *outfile) { + // Preserve the encoder's frame order exactly to maintain DPB consistency. + // + // In multi_layers_lag_test mode, the encoder codes hidden frames (ARF, + // INTNL_ARF) followed by the displayable frame for each mlayer, then + // repeats for the next mlayer. It inserts a TD before each group of + // frames that belong to the same temporal unit. We respect these TDs + // as TU boundaries, bundling hidden frames with their displayable frame + // into a single TU. + + // Single-pass: parse all OBUs into a stack-allocated array. + // A typical encoder packet contains at most a few dozen OBUs per TU + // (TD + SH + MFH + QM + FGM + CI + BRT + frame OBUs per mlayer). + // 256 entries is generous for any realistic configuration. 
+ typedef struct { + size_t data_offset; // start of OBU data (after length field) + size_t data_size; // OBU total size (header + payload) + int type; + int mlayer_id; + int is_td; + int is_structural; + int is_keyframe; + int obu_header_size; // 1 or 2 bytes + } ObuEntry; + + enum { MAX_OBU_ENTRIES = 256 }; + ObuEntry obus[MAX_OBU_ENTRIES]; + int num_obus = 0; + + { + size_t consumed = 0; + while (consumed < size && num_obus < MAX_OBU_ENTRIES) { + size_t length_field_size = 0; + uint64_t obu_total_size = 0; + if (avm_uleb_decode(data + consumed, size - consumed, &obu_total_size, + &length_field_size) != 0) + break; + if (obu_total_size == 0 || + consumed + length_field_size + obu_total_size > size) + break; + const uint8_t *obu_start = data + consumed + length_field_size; + ObuHeader hdr; + memset(&hdr, 0, sizeof(hdr)); + parse_obu_header_byte(obu_start[0], &hdr); + if (hdr.obu_header_extension_flag) parse_obu_ext_byte(obu_start[1], &hdr); + + obus[num_obus].data_offset = consumed + length_field_size; + obus[num_obus].data_size = (size_t)obu_total_size; + obus[num_obus].type = hdr.type; + obus[num_obus].mlayer_id = hdr.obu_mlayer_id; + obus[num_obus].is_td = (hdr.type == OBU_TEMPORAL_DELIMITER); + obus[num_obus].is_structural = + (hdr.type == OBU_MULTI_STREAM_DECODER_OPERATION || + hdr.type == OBU_LAYER_CONFIGURATION_RECORD || + hdr.type == OBU_OPERATING_POINT_SET || + hdr.type == OBU_ATLAS_SEGMENT); + obus[num_obus].is_keyframe = (hdr.type == OBU_CLOSED_LOOP_KEY); + obus[num_obus].obu_header_size = + 1 + (hdr.obu_header_extension_flag ? 1 : 0); + + consumed += length_field_size + (size_t)obu_total_size; + num_obus++; + } + } + + if (num_obus == 0) return 0; + + // Respect the encoder's TD placement to form TUs. The encoder inserts a + // TD before each group of hidden + displayable frames that belong to the + // same temporal unit. 
All frames between two consecutive encoder TDs are + // bundled into a single output TU, keeping hidden frames together with + // their displayable frame as the spec requires. + int tu_count = 0; + int tu_started = 0; // 1 once we've written our TD for the current TU + int structural_written = 0; // 1 once structural OBUs written for current TU + int pending_start = -1; // Start of non-frame OBUs preceding a frame + + for (int i = 0; i < num_obus; i++) { + if (obus[i].is_td) { + // Encoder TD marks start of a new temporal unit. + // Flush the previous TU if one was started. + if (tu_started) { + tu_assembler_flush(ta, outfile); + tu_count++; + } + // Begin new TU with our own TD. + ta->size = 0; + tu_assembler_write_td(ta); + tu_started = 1; + structural_written = 0; + pending_start = -1; + continue; + } + + if (obus[i].is_structural) continue; // skip encoder structural OBUs + + int is_frame = (obus[i].type != OBU_SEQUENCE_HEADER) && + (obus[i].type != OBU_MULTI_FRAME_HEADER) && + (obus[i].type != OBU_BUFFER_REMOVAL_TIMING) && + (obus[i].type != OBU_QUANTIZATION_MATRIX) && + (obus[i].type != OBU_FILM_GRAIN_MODEL) && + (obus[i].type != OBU_CONTENT_INTERPRETATION); + + if (!is_frame) { + // Track where non-frame OBUs start + if (pending_start < 0) pending_start = i; + continue; + } + + // Frame OBU — append to the current TU. + if (!tu_started) { + // Frame without a preceding encoder TD (e.g. ml>0 frames in a + // separate call). Start a new TU. + ta->size = 0; + tu_assembler_write_td(ta); + tu_started = 1; + structural_written = 0; + } + + // Write structural OBUs once per TU, before the first frame that + // needs them. tu_assembler_write_structural_obus() has its own + // first_output / keyframe guard, so calling it for each frame is safe + // — it will only emit once. 
+ if (!structural_written) { + tu_assembler_write_structural_obus(ta, mcfg, first_output, + obus[i].is_keyframe); + structural_written = 1; + } + + // Write any non-frame OBUs that preceded this frame (SH, etc.) + for (int j = (pending_start >= 0 ? pending_start : i); j < i; j++) { + if (obus[j].is_td || obus[j].is_structural) continue; + rewrite_and_append_obu(ta, data + obus[j].data_offset, obus[j].data_size, + obus[j].obu_header_size, xlayer_id); + } + pending_start = -1; + + // Write the frame OBU itself with xlayer_id + rewrite_and_append_obu(ta, data + obus[i].data_offset, obus[i].data_size, + obus[i].obu_header_size, xlayer_id); + } + + // Flush the last TU if one is in progress. + if (tu_started) { + tu_assembler_flush(ta, outfile); + tu_count++; + } + + return tu_count; +} + +int tu_assembler_parse_tu_segments(const uint8_t *data, size_t size, + TUSegmentInfo *segs, int max_segs) { + // Scan OBUs, splitting at TD boundaries. Each segment starts at a TD + // and extends to just before the next TD (or end of data). 
+ int nseg = 0; + size_t seg_start = 0; + int has_kf = 0; + int in_segment = 0; + size_t consumed = 0; + + while (consumed < size) { + size_t length_field_size = 0; + uint64_t obu_total_size = 0; + if (avm_uleb_decode(data + consumed, size - consumed, &obu_total_size, + &length_field_size) != 0) + break; + if (obu_total_size == 0 || + consumed + length_field_size + obu_total_size > size) + break; + + const uint8_t *obu_start = data + consumed + length_field_size; + ObuHeader hdr; + memset(&hdr, 0, sizeof(hdr)); + parse_obu_header_byte(obu_start[0], &hdr); + + size_t obu_end = consumed + length_field_size + (size_t)obu_total_size; + + if (hdr.type == OBU_TEMPORAL_DELIMITER) { + // Close previous segment if any + if (in_segment && nseg < max_segs) { + segs[nseg].offset = seg_start; + segs[nseg].size = consumed - seg_start; + segs[nseg].has_keyframe = has_kf; + nseg++; + } + // Start new segment at this TD + seg_start = consumed; + has_kf = 0; + in_segment = 1; + } else if (hdr.type == OBU_CLOSED_LOOP_KEY) { + has_kf = 1; + } + + consumed = obu_end; + } + + // Close last segment + if (in_segment && consumed > seg_start && nseg < max_segs) { + segs[nseg].offset = seg_start; + segs[nseg].size = consumed - seg_start; + segs[nseg].has_keyframe = has_kf; + nseg++; + } + + return nseg; +} + +void tu_assembler_print_contents(const TUAssembler *ta, int tu_index) { + const uint8_t *buf = ta->buffer; + size_t buf_size = ta->size; + fprintf(stdout, "--- TU %d [%zu bytes] ---\n", tu_index, buf_size); + size_t pos = 0; + while (pos < buf_size) { + ObuHeader hdr; + size_t payload_size = 0; + size_t bytes_read = 0; + if (avm_read_obu_header_and_size(buf + pos, buf_size - pos, &hdr, + &payload_size, + &bytes_read) != AVM_CODEC_OK) + break; + size_t obu_total = bytes_read + payload_size; + fprintf(stdout, " %-36s xl:%2d ml:%d tl:%d %4zu bytes\n", + avm_obu_type_to_string(hdr.type), hdr.obu_xlayer_id, + hdr.obu_mlayer_id, hdr.obu_tlayer_id, obu_total); + pos += obu_total; + } +} + +// 
--- Structural OBU content population --- + +// Derive configuration_idc from the highest chroma format among the given +// xlayer profiles. See Table A.1 in annexA.c: +// 0 = C_MAIN_420_10 (4:0:0, 4:2:0) +// 1 = C_MAIN_422_10 (4:0:0, 4:2:0, 4:2:2) +// 2 = C_MAIN_444_10 (4:0:0, 4:2:0, 4:4:4) +static int derive_config_idc_from_profiles(const MultiXLayerConfig *mcfg, + uint32_t xlayer_map) { + int config_idc = 0; // C_MAIN_420_10 + for (int i = 0; i < mcfg->num_xlayers; i++) { + int id = mcfg->xlayers[i].xlayer_id; + if (!(xlayer_map & (1u << id))) continue; + unsigned int prof = mcfg->xlayers[i].profile; + if (prof == MAIN_444_10_IP1) { + config_idc = 2; // C_MAIN_444_10 — highest, can stop + break; + } else if (prof == MAIN_422_10_IP1 && config_idc < 1) { + config_idc = 1; // C_MAIN_422_10 + } + } + return config_idc; +} + +// Derive the aggregate level index for a set of xlayers identified by +// xlayer_map. The aggregate level is the smallest level whose constraints +// accommodate the combined resources of all constituent xlayers: +// 1. max_picture_size >= sum of all xlayers' picture sizes +// 2. max_decode_rate >= sum of all xlayers' decode rates (pic_size * fps) +// When frame_rate is 0 (not specified), only picture size is checked. 
+static int derive_aggregate_level(const MultiXLayerConfig *mcfg, + uint32_t xlayer_map) { + int64_t total_picture_size = 0; + int64_t total_decode_rate = 0; + int max_individual_level = 0; + double fps = mcfg->frame_rate; + + for (int i = 0; i < mcfg->num_xlayers; i++) { + int id = mcfg->xlayers[i].xlayer_id; + if (!(xlayer_map & (1u << id))) continue; + int64_t pic_size = + (int64_t)mcfg->xlayers[i].width * mcfg->xlayers[i].height; + total_picture_size += pic_size; + if (fps > 0) total_decode_rate += (int64_t)(pic_size * fps); + if ((int)mcfg->xlayers[i].level > max_individual_level) + max_individual_level = (int)mcfg->xlayers[i].level; + } + + // Walk the level table and find the smallest level that satisfies all + // constraints. The aggregate level must also be >= every individual level. + int agg_level = max_individual_level; + for (int l = 0; l < SEQ_LEVELS; l++) { + if (l < max_individual_level) continue; + if (av2_level_defs[l].max_picture_size < total_picture_size) continue; + if (fps > 0 && av2_level_defs[l].max_decode_rate < total_decode_rate) + continue; + agg_level = l; + break; + } + return agg_level; +} + +// Apply scaling mode to a dimension, returning the scaled size. +// Uses round-up division to match the encoder's internal scaling behavior. 
+void populate_global_lcr_from_config(const MultiXLayerConfig *mcfg, + GlobalLayerConfigurationRecord *glcr) { + memset(glcr, 0, sizeof(*glcr)); + + glcr->lcr_global_config_record_id = 1; + + // Build xlayer_map bitmask and xlayer ID list + uint32_t xlayer_map = 0; + for (int i = 0; i < mcfg->num_xlayers; i++) { + int id = mcfg->xlayers[i].xlayer_id; + xlayer_map |= (1u << id); + glcr->LcrXLayerID[i] = id; + } + glcr->lcr_xlayer_map = (int)xlayer_map; + glcr->LcrMaxNumXLayerCount = mcfg->num_xlayers; + + glcr->lcr_global_payload_present_flag = 1; + glcr->lcr_global_purpose_id = mcfg->lcr_purpose_id; + glcr->lcr_dependent_xlayers_flag = mcfg->lcr_dependent_xlayers_flag; + glcr->lcr_doh_constraint_flag = mcfg->lcr_doh_constraint_flag; + glcr->lcr_seq_profile_tier_level_info_present_flag = 1; + + // Derive aggregate configuration_idc from all xlayers + glcr->aggregate_ptl.lcr_config_idc = + derive_config_idc_from_profiles(mcfg, (uint32_t)xlayer_map); + + // Populate per-xlayer info + for (int i = 0; i < mcfg->num_xlayers; i++) { + const XLayerEncConfig *xl = &mcfg->xlayers[i]; + LCRXLayerInfo *xinfo = &glcr->xlayer_info[i]; + + // Representation info (resolution) + xinfo->lcr_rep_info_present_flag = 1; + xinfo->rep_params.lcr_max_pic_width = (int)xl->width; + xinfo->rep_params.lcr_max_pic_height = (int)xl->height; + + // Color info + if (xl->color_primaries >= 0) { + xinfo->lcr_xlayer_color_info_present_flag = 1; + xinfo->xlayer_col_params.layer_color_primaries = xl->color_primaries; + xinfo->xlayer_col_params.layer_transfer_characteristics = + xl->transfer_characteristics; + xinfo->xlayer_col_params.layer_matrix_coefficients = + xl->matrix_coefficients; + xinfo->xlayer_col_params.layer_full_range_flag = xl->full_range_flag; + } + + // Embedded layer info + if (xl->num_embedded_layers > 1 || xl->num_temporal_layers > 1) { + xinfo->lcr_embedded_layer_info_present_flag = 1; + struct EmbeddedLayerInfo *ml = &xinfo->mlayer_params; + ml->MLayerCount = 
xl->num_embedded_layers; + // mlayer_map: bitmask of embedded layers present + ml->lcr_mlayer_map = (1 << xl->num_embedded_layers) - 1; + for (int m = 0; m < xl->num_embedded_layers; m++) { + ml->LcrMlayerID[m] = m; + ml->lcr_layer_type[m] = xl->layer_type; + if (xl->layer_type == AUX_LAYER) { + ml->lcr_auxiliary_type[m] = xl->auxiliary_type; + } + ml->lcr_view_type[m] = xl->view_type; + ml->TLayerCount[m] = xl->num_temporal_layers; + ml->lcr_tlayer_map[m] = (1 << xl->num_temporal_layers) - 1; + // Set resolution flag based on scaling mode. + // lcr_max_expected_width/height signals the maximum frame dimensions + // that can appear for this mlayer. For scaled layers, this must be + // the xlayer's full resolution (not the scaled size) because the + // encoder may produce full-res frames (e.g., on keyframes that reset + // the resize state). + int sm = xl->scaling_mode[m]; + if (sm != AVME_NORMAL) { + ml->lcr_same_sh_max_resolution_flag[m] = 0; + ml->lcr_max_expected_width[m] = (int)xl->width; + ml->lcr_max_expected_height[m] = (int)xl->height; + } else { + ml->lcr_same_sh_max_resolution_flag[m] = 1; + } + // Populate dependency map from config + ml->lcr_dependent_layer_map[m] = + resolve_mlayer_dep_mask(&xl->mlayer_sources[m], m); + } + } else { + // Single embedded layer, single temporal layer + xinfo->lcr_embedded_layer_info_present_flag = 1; + struct EmbeddedLayerInfo *ml = &xinfo->mlayer_params; + ml->MLayerCount = 1; + ml->lcr_mlayer_map = 1; + ml->LcrMlayerID[0] = 0; + ml->lcr_layer_type[0] = xl->layer_type; + if (xl->layer_type == AUX_LAYER) { + ml->lcr_auxiliary_type[0] = xl->auxiliary_type; + } + ml->lcr_view_type[0] = xl->view_type; + ml->TLayerCount[0] = xl->num_temporal_layers; + ml->lcr_tlayer_map[0] = (1 << xl->num_temporal_layers) - 1; + ml->lcr_same_sh_max_resolution_flag[0] = 1; + } + + // Seq profile/tier/level info + glcr->seq_ptl[i].lcr_seq_profile_idc = xl->profile; + glcr->seq_ptl[i].lcr_max_level_idx = xl->level; + 
glcr->seq_ptl[i].lcr_tier_flag = xl->tier; + } + + // Derive aggregate level and tier from all xlayers + { + int max_tier = 0; + for (int i = 0; i < mcfg->num_xlayers; i++) { + if ((int)mcfg->xlayers[i].tier > max_tier) + max_tier = (int)mcfg->xlayers[i].tier; + } + glcr->aggregate_ptl.lcr_aggregate_level_idx = + derive_aggregate_level(mcfg, (uint32_t)xlayer_map); + glcr->aggregate_ptl.lcr_max_tier_flag = max_tier; + } +} + +void populate_ops_from_config(const OPSConfig *ops_cfg, int xlayer_id, + const MultiXLayerConfig *mcfg, + OperatingPointSet *ops) { + memset(ops, 0, sizeof(*ops)); + if (!ops_cfg->enable) return; + + ops->valid = 1; + ops->obu_xlayer_id = xlayer_id; + ops->ops_id = ops_cfg->ops_id; + ops->ops_cnt = ops_cfg->num_operating_points; + ops->ops_priority = ops_cfg->priority; + ops->ops_intent_present_flag = ops_cfg->intent_present_flag; + ops->ops_ptl_present_flag = ops_cfg->ptl_present_flag; + ops->ops_color_info_present_flag = ops_cfg->color_info_present_flag; + ops->ops_mlayer_info_idc = ops_cfg->mlayer_info_idc; + + for (int p = 0; p < ops_cfg->num_operating_points; p++) { + const OperatingPointConfig *opc = &ops_cfg->ops[p]; + OperatingPoint *op = &ops->op[p]; + + op->ops_intent_op = opc->intent; + op->ops_xlayer_map = (int)opc->xlayer_map; + op->ops_initial_display_delay = + BUFFER_POOL_MAX_SIZE; // default: not present + + // Derive XCount and OpsxLayerID from xlayer_map + op->XCount = 0; + for (int bit = 0; bit < (int)(MAX_NUM_XLAYERS - 1); bit++) { + if (opc->xlayer_map & (1u << bit)) { + op->OpsxLayerID[op->XCount] = bit; + op->XCount++; + } + } + + // Per-xlayer mlayer counts and map derivation + for (int x = 0; x < op->XCount; x++) { + int xl = op->OpsxLayerID[x]; + int ml_count = opc->mlayer_count[x]; + op->ops_mlayer_count[xl] = ml_count; + // Derive ops_mlayer_map: include the first ml_count mlayers + if (ml_count > 0 && ops->ops_mlayer_info_idc >= 1) { + op->mlayer_info.ops_mlayer_map[xl] = (1 << ml_count) - 1; + // Default: all 
temporal layers for each included mlayer + for (int m = 0; m < ml_count; m++) { + // Find the xlayer config to get num_temporal_layers + int tl_count = 1; + for (int j = 0; j < mcfg->num_xlayers; j++) { + if (mcfg->xlayers[j].xlayer_id == xl) { + tl_count = mcfg->xlayers[j].num_temporal_layers; + break; + } + } + op->mlayer_info.ops_tlayer_map[xl][m] = (1 << tl_count) - 1; + } + // For idc==2, use explicit info (not embedded OPS references) + if (ops->ops_mlayer_info_idc == 2) { + op->ops_mlayer_explicit_info_flag[xl] = 1; + } + } + } + + // Derive ops_config_idc from the profiles of constituent xlayers + op->ops_config_idc = derive_config_idc_from_profiles(mcfg, opc->xlayer_map); + + // Aggregate level/tier + if (opc->aggregate_level_idx >= 0) { + op->ops_aggregate_level_idx = opc->aggregate_level_idx; + } else { + // Derive: find smallest level accommodating summed picture sizes + op->ops_aggregate_level_idx = + derive_aggregate_level(mcfg, opc->xlayer_map); + } + + if (opc->max_tier_flag >= 0) { + op->ops_max_tier_flag = opc->max_tier_flag; + } else { + // Derive: max tier across constituent xlayers + for (int x = 0; x < op->XCount; x++) { + int xl_id = op->OpsxLayerID[x]; + for (int j = 0; j < mcfg->num_xlayers; j++) { + if (mcfg->xlayers[j].xlayer_id == xl_id) { + if ((int)mcfg->xlayers[j].tier > op->ops_max_tier_flag) + op->ops_max_tier_flag = (int)mcfg->xlayers[j].tier; + break; + } + } + } + } + + // Embedded OPS references + for (int x = 0; x < MAX_NUM_XLAYERS; x++) { + op->ops_embedded_ops_id[x] = opc->embedded_ops_id[x]; + op->ops_embedded_op_index[x] = opc->embedded_op_index[x]; + } + } +} + +void populate_atlas_from_config(const MultiXLayerConfig *mcfg, + AtlasSegmentInfo *atlas) { + memset(atlas, 0, sizeof(*atlas)); + if (!mcfg->enable_atlas) return; + + atlas->valid = 1; + atlas->obu_xlayer_id = GLOBAL_XLAYER_ID; + atlas->atlas_segment_id = 1; + atlas->atlas_segment_mode_idc = mcfg->atlas_mode; + + const int n = mcfg->num_xlayers; + + if 
(mcfg->atlas_mode == ENHANCED_ATLAS) { + // Enhanced Atlas: region grid from xlayer count/dimensions + struct AtlasRegionInfo *reg = &atlas->ats_reg_params; + + if (mcfg->atlas_uniform_spacing) { + // Auto-grid: N columns x 1 row + reg->ats_uniform_spacing_flag = 1; + reg->ats_num_region_columns_minus_1 = n - 1; + reg->ats_num_region_rows_minus_1 = 0; + + // Use first xlayer's dimensions as the uniform region size + reg->ats_region_width_minus_1 = (int)mcfg->xlayers[0].width - 1; + reg->ats_region_height_minus_1 = (int)mcfg->xlayers[0].height - 1; + reg->NumRegionsInAtlas = n; + + // Derive atlas dimensions + reg->AtlasWidth = (int)mcfg->xlayers[0].width * n; + reg->AtlasHeight = (int)mcfg->xlayers[0].height; + + // Single region per atlas segment (one xlayer per region) + atlas->ats_reg_seg_map.ats_single_region_per_atlas_segment_flag = 1; + atlas->ats_reg_seg_map.ats_num_atlas_segments_minus_1 = n - 1; + } else { + // Explicit positions: derive grid from per-xlayer atlas_pos_x/y. + // Collect unique X and Y boundaries to determine columns and rows. + reg->ats_uniform_spacing_flag = 0; + + // Collect unique column start positions and widths + int col_x[MAX_NUM_XLAYERS]; + int col_w[MAX_NUM_XLAYERS]; + int num_cols = 0; + int row_y[MAX_NUM_XLAYERS]; + int row_h[MAX_NUM_XLAYERS]; + int num_rows = 0; + + for (int i = 0; i < n; i++) { + int px = mcfg->xlayers[i].atlas_pos_x >= 0 + ? mcfg->xlayers[i].atlas_pos_x + : 0; + int py = mcfg->xlayers[i].atlas_pos_y >= 0 + ? 
mcfg->xlayers[i].atlas_pos_y + : 0; + int w = (int)mcfg->xlayers[i].width; + int h = (int)mcfg->xlayers[i].height; + + // Insert unique column + int found = 0; + for (int c = 0; c < num_cols; c++) { + if (col_x[c] == px) { + found = 1; + break; + } + } + if (!found) { + col_x[num_cols] = px; + col_w[num_cols] = w; + num_cols++; + } + + // Insert unique row + found = 0; + for (int r = 0; r < num_rows; r++) { + if (row_y[r] == py) { + found = 1; + break; + } + } + if (!found) { + row_y[num_rows] = py; + row_h[num_rows] = h; + num_rows++; + } + } + + // Sort columns by X position (simple insertion sort) + for (int i = 1; i < num_cols; i++) { + int kx = col_x[i], kw = col_w[i]; + int j = i - 1; + while (j >= 0 && col_x[j] > kx) { + col_x[j + 1] = col_x[j]; + col_w[j + 1] = col_w[j]; + j--; + } + col_x[j + 1] = kx; + col_w[j + 1] = kw; + } + + // Sort rows by Y position + for (int i = 1; i < num_rows; i++) { + int ky = row_y[i], kh = row_h[i]; + int j = i - 1; + while (j >= 0 && row_y[j] > ky) { + row_y[j + 1] = row_y[j]; + row_h[j + 1] = row_h[j]; + j--; + } + row_y[j + 1] = ky; + row_h[j + 1] = kh; + } + + reg->ats_num_region_columns_minus_1 = num_cols - 1; + reg->ats_num_region_rows_minus_1 = num_rows - 1; + for (int c = 0; c < num_cols; c++) + reg->ats_column_width_minus_1[c] = col_w[c] - 1; + for (int r = 0; r < num_rows; r++) + reg->ats_row_height_minus_1[r] = row_h[r] - 1; + reg->NumRegionsInAtlas = num_cols * num_rows; + + // Use explicit region-to-segment mapping since not all grid cells + // may be occupied (e.g., 3 regions in a 2x2 grid). + atlas->ats_reg_seg_map.ats_single_region_per_atlas_segment_flag = 0; + atlas->ats_reg_seg_map.ats_num_atlas_segments_minus_1 = n - 1; + + // Map each xlayer to its grid cell + for (int i = 0; i < n; i++) { + int px = mcfg->xlayers[i].atlas_pos_x >= 0 + ? mcfg->xlayers[i].atlas_pos_x + : 0; + int py = mcfg->xlayers[i].atlas_pos_y >= 0 + ? 
mcfg->xlayers[i].atlas_pos_y + : 0; + int col_idx = 0, row_idx = 0; + for (int c = 0; c < num_cols; c++) { + if (col_x[c] == px) { + col_idx = c; + break; + } + } + for (int r = 0; r < num_rows; r++) { + if (row_y[r] == py) { + row_idx = r; + break; + } + } + atlas->ats_reg_seg_map.ats_top_left_region_column[i] = col_idx; + atlas->ats_reg_seg_map.ats_top_left_region_row[i] = row_idx; + atlas->ats_reg_seg_map.ats_bottom_right_region_column_offset[i] = 0; + atlas->ats_reg_seg_map.ats_bottom_right_region_row_offset[i] = 0; + // Derived fields + atlas->ats_reg_seg_map.ats_bottom_right_region_column[i] = col_idx; + atlas->ats_reg_seg_map.ats_bottom_right_region_row[i] = row_idx; + } + } + + // No signalled segment IDs + atlas->ats_label_seg.ats_signalled_atlas_segment_ids_flag = 0; + + } else if (mcfg->atlas_mode == MULTISTREAM_ATLAS) { + // Multistream Atlas: per-segment positions from xlayer config + struct AtlasBasicInfo *basic = &atlas->ats_basic_info_s; + atlas->ats_basic_info = basic; + + basic->ats_stream_id_present = 1; + basic->ats_num_atlas_segments_minus_1 = n - 1; + + // Derive or use explicit atlas dimensions + if (mcfg->atlas_width > 0) { + basic->ats_atlas_width = mcfg->atlas_width; + basic->ats_atlas_height = mcfg->atlas_height; + } else { + // Auto-derive: horizontal tiling + int total_w = 0; + int max_h = 0; + for (int i = 0; i < n; i++) { + total_w += (int)mcfg->xlayers[i].width; + if ((int)mcfg->xlayers[i].height > max_h) + max_h = (int)mcfg->xlayers[i].height; + } + basic->ats_atlas_width = total_w; + basic->ats_atlas_height = max_h; + } + basic->AtlasWidth = basic->ats_atlas_width; + basic->AtlasHeight = basic->ats_atlas_height; + + // Per-segment info + int auto_x = 0; + for (int i = 0; i < n; i++) { + basic->ats_input_stream_id[i] = mcfg->xlayers[i].xlayer_id; + basic->ats_segment_width[i] = (int)mcfg->xlayers[i].width; + basic->ats_segment_height[i] = (int)mcfg->xlayers[i].height; + + if (mcfg->xlayers[i].atlas_pos_x >= 0) { + 
basic->ats_segment_top_left_pos_x[i] = mcfg->xlayers[i].atlas_pos_x; + basic->ats_segment_top_left_pos_y[i] = mcfg->xlayers[i].atlas_pos_y; + } else { + // Auto-place: horizontal tiling + basic->ats_segment_top_left_pos_x[i] = auto_x; + basic->ats_segment_top_left_pos_y[i] = 0; + } + auto_x += (int)mcfg->xlayers[i].width; + } + + // No signalled segment IDs + atlas->ats_label_seg.ats_signalled_atlas_segment_ids_flag = 0; + } +} diff --git a/common/tu_assembler.h b/common/tu_assembler.h new file mode 100644 index 0000000000..16eea688c0 --- /dev/null +++ b/common/tu_assembler.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. 
+ */ + +#ifndef AVM_COMMON_TU_ASSEMBLER_H_ +#define AVM_COMMON_TU_ASSEMBLER_H_ + +#include +#include + +#include "av2/common/enums.h" +#include "av2/common/av2_common_int.h" +#include "common/xlayer_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define TU_ASM_INITIAL_CAPACITY (256 * 1024) + +typedef struct TUAssembler { + uint8_t *buffer; + size_t size; + size_t capacity; + int num_xlayers; + int xlayer_ids[MAX_NUM_XLAYERS - 1]; + // Structural OBU data populated from config + GlobalLayerConfigurationRecord global_lcr; + int msdo_enabled; + int num_ops_sets; + OperatingPointSet ops_list[MAX_NUM_OPS_ID]; + AtlasSegmentInfo atlas_info; + // Reference to the config for OBU population + const MultiXLayerConfig *config; +} TUAssembler; + +// Initialize assembler from multi-xlayer config +int tu_assembler_init(TUAssembler *ta, const MultiXLayerConfig *mcfg); + +// Free assembler resources +void tu_assembler_free(TUAssembler *ta); + +// Write a Temporal Delimiter OBU (xlayer_id=31) +int tu_assembler_write_td(TUAssembler *ta); + +// Write a Global LCR OBU +int tu_assembler_write_global_lcr(TUAssembler *ta); + +// Write a Local LCR OBU for the specified xlayer config index. +// The xlayer_info is copied from the Global LCR to ensure decoder-side +// consistency validation passes when both Global and Local LCRs are present. +int tu_assembler_write_local_lcr(TUAssembler *ta, int xlayer_idx); + +// Write an MSDO OBU +int tu_assembler_write_msdo(TUAssembler *ta); + +// Write an OPS OBU for the specified xlayer_id +int tu_assembler_write_ops(TUAssembler *ta, int xlayer_id); + +// Write an Atlas OBU +int tu_assembler_write_atlas(TUAssembler *ta); + +// Append per-xlayer OBUs from an encoder packet, rewriting OBU headers +// with the given xlayer_id. Skips per-xlayer TDs and structural OBUs. 
+int tu_assembler_append_xlayer_obus(TUAssembler *ta, int xlayer_id, + const uint8_t *data, size_t size); + +// Flush the assembled buffer to the output file and reset size to 0 +int tu_assembler_flush(TUAssembler *ta, FILE *outfile); + +// Write structural OBUs (LCR, OPS, Atlas) into the assembler buffer. +// Emits once per TU: only when *first_output is set or has_keyframe is true. +void tu_assembler_write_structural_obus(TUAssembler *ta, + const MultiXLayerConfig *mcfg, + int *first_output, int has_keyframe); + +// Split encoder output at internal TD boundaries and write each segment as +// a separate TU. This is used for multi_layers_lag_test mode where the +// encoder inserts TDs between implicit_output frames at different OrderHints +// to satisfy the DOH constraint. Each segment gets its own TD, structural +// OBUs (on first_output or keyframe), and xlayer-rewritten frame data. +// Returns the number of TUs written, or -1 on error. +int tu_assembler_write_split_tus(TUAssembler *ta, const MultiXLayerConfig *mcfg, + int xlayer_id, const uint8_t *data, + size_t size, int *first_output, FILE *outfile); + +// A parsed TU segment: a contiguous byte range of OBU data between two +// consecutive TD boundaries in an encoder's output. +#define MAX_TU_SEGMENTS 64 + +typedef struct TUSegmentInfo { + size_t offset; // start offset in the source data + size_t size; // byte size of this segment (including the TD) + int has_keyframe; // 1 if segment contains a keyframe OBU +} TUSegmentInfo; + +// Parse encoder output into TU segments split at TD boundaries. +// Each segment spans from one TD to the next (or end of data). +// Returns the number of segments found (stored in segs[]), or -1 on error. +int tu_assembler_parse_tu_segments(const uint8_t *data, size_t size, + TUSegmentInfo *segs, int max_segs); + +// Print a summary of all OBUs in the current assembled TU buffer to stdout. +// Must be called before tu_assembler_flush() (which resets the buffer). 
+void tu_assembler_print_contents(const TUAssembler *ta, int tu_index); + +// Populate a GlobalLayerConfigurationRecord from config +void populate_global_lcr_from_config(const MultiXLayerConfig *mcfg, + GlobalLayerConfigurationRecord *glcr); + +// Populate an OperatingPointSet from config +void populate_ops_from_config(const OPSConfig *ops_cfg, int xlayer_id, + const MultiXLayerConfig *mcfg, + OperatingPointSet *ops); + +// Populate AtlasSegmentInfo from config +void populate_atlas_from_config(const MultiXLayerConfig *mcfg, + AtlasSegmentInfo *atlas); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AVM_COMMON_TU_ASSEMBLER_H_ diff --git a/common/xlayer_config.h b/common/xlayer_config.h new file mode 100644 index 0000000000..a72c6f3319 --- /dev/null +++ b/common/xlayer_config.h @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. 
+ */ + +#ifndef AVM_COMMON_XLAYER_CONFIG_H_ +#define AVM_COMMON_XLAYER_CONFIG_H_ + +#include +#include +#include + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#include "av2/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_INPUT_SOURCES 8 +#define MAX_SOURCE_NAME_LEN 64 +#define MAX_CODEC_CONTROLS 32 + +// Named input source for multi-source encoding +typedef struct InputSourceConfig { + char name[MAX_SOURCE_NAME_LEN]; + char filename[PATH_MAX]; + unsigned int width; + unsigned int height; + int format; // pixel format: 0=auto, 420, 422, 444 + int bit_depth; // 0=auto (detect from file or default 8) + int frame_rate_num; // 0/0=auto (detect from Y4M or use global timebase) + int frame_rate_den; + int frame_skip; // resolved: max_fps/this_fps (1=every TU, 2=every other) +} InputSourceConfig; + +// Per-embedded-layer source and dependency configuration +typedef struct MLayerSourceConfig { + char input_source_name[MAX_SOURCE_NAME_LEN]; // "" = inherit from xlayer + int input_source_idx; // resolved: -1 = inherit from xlayer + int atlas_pos_x; // crop origin X (-1 = inherit from xlayer) + int atlas_pos_y; // crop origin Y (-1 = inherit from xlayer) + unsigned int width; // crop width (0 = inherit from xlayer) + unsigned int height; // crop height (0 = inherit from xlayer) + int dependency_mask; // bitmask of lower mlayers this depends on + // (-1 = default linear chain) + // Content Interpretation overrides (-1 = inherit from xlayer) + int color_primaries; + int transfer_characteristics; + int matrix_coefficients; + int full_range_flag; +} MLayerSourceConfig; + +// Default mlayer dependency mask: linear chain where each mlayer depends on +// all lower mlayers. mlayer 0 has mask 0 (no dependencies). +#define DEFAULT_MLAYER_DEP_MASK(m) ((m) > 0 ? (1 << (m)) - 1 : 0) + +// Resolve a per-mlayer dependency mask, replacing the sentinel -1 with the +// default linear chain. 
+static inline int resolve_mlayer_dep_mask(const MLayerSourceConfig *ms, int m) { + return (ms->dependency_mask >= 0) ? ms->dependency_mask + : DEFAULT_MLAYER_DEP_MASK(m); +} + +// Per extended-layer encoder configuration +typedef struct XLayerEncConfig { + int xlayer_id; // 0-30 + char input_filename[PATH_MAX]; + unsigned int width; + unsigned int height; + unsigned int profile; + unsigned int tier; + unsigned int level; + int layer_type; // TEXTURE_LAYER, AUX_LAYER, STEREO_LAYER, etc. + int auxiliary_type; // LCR_ALPHA_AUX, LCR_DEPTH_AUX, etc. (if AUX_LAYER) + int view_type; // VIEW_UNSPECIFIED, VIEW_LEFT, VIEW_RIGHT, etc. + int num_temporal_layers; + int num_embedded_layers; + // Color info + int color_primaries; + int transfer_characteristics; + int matrix_coefficients; + int full_range_flag; + // Encoder overrides (-1 = use global default) + int qp; + int bitrate; + int cpu_used; + int lag_in_frames; + int sframe_dist; // S-Frame interval (-1 = disabled/default) + int sframe_mode; // S-Frame insertion mode (-1 = default) + int sframe_type; // S-Frame type: 0=regular, 1=RAS (-1 = default) + // Coding structure (-1 or empty = use global default) + int kf_max_dist; // keyframe interval (-1 = default) + char subgop_config_path[PATH_MAX]; // sub-GOP config file (empty = default) + // GOP mode: 0=closed(CLK), 1=open_leading(OLK), 2=open_sef + int gop_mode; + int fwd_kf_enabled; // override: -1=derive from gop_mode + int enable_keyframe_filtering; // override: -1=derive from gop_mode + int add_sef_for_hidden_frames; // override: -1=derive from gop_mode + // Atlas layout position in composite canvas (-1 = auto) + int atlas_pos_x; + int atlas_pos_y; + // Input source reference (for multi-source encoding) + char input_source_name[MAX_SOURCE_NAME_LEN]; // references InputSourceConfig + int input_source_idx; // resolved index into input_sources[] (-1 = own file) + // Scaling for embedded layers + int scaling_mode[MAX_NUM_MLAYERS]; + // Per-embedded-layer source and 
dependency configuration + MLayerSourceConfig mlayer_sources[MAX_NUM_MLAYERS]; + int has_per_mlayer_sources; // 1 if any mlayer has its own source/crop + int has_mlayer_dependencies; // 1 if any mlayer has explicit dependency_mask + // Generic post-init codec controls from JSON "codec_controls" array + int num_codec_controls; + struct { + char name[64]; + int value; + } codec_controls[MAX_CODEC_CONTROLS]; +} XLayerEncConfig; + +// Per operating-point configuration within an OPS set +typedef struct OperatingPointConfig { + int intent; // OPS intent (display, monitoring, etc.) + uint32_t xlayer_map; // bitmask of xlayers included in this OP + // Per-xlayer within this OP + int mlayer_count[MAX_NUM_XLAYERS]; // embedded layers per xlayer (0=all) + int tlayer_count[MAX_NUM_XLAYERS]; // temporal layers per xlayer (0=all) + // PTL overrides for this OP + int aggregate_level_idx; // -1 = derive from constituent layers + int max_tier_flag; // -1 = derive + // Per-xlayer embedded OPS references + int embedded_ops_id[MAX_NUM_XLAYERS]; // -1 = not set + int embedded_op_index[MAX_NUM_XLAYERS]; // -1 = not set +} OperatingPointConfig; + +// OPS set configuration (one per OPS OBU) +typedef struct OPSConfig { + int enable; + int ops_id; // OPS ID (0-15) + int priority; // OPS priority + int intent_present_flag; + int ptl_present_flag; + int color_info_present_flag; + int mlayer_info_idc; // 0=no info, 1=same, 2=explicit + int num_operating_points; + OperatingPointConfig ops[MAX_OPS_COUNT]; +} OPSConfig; + +// Top-level multi-xlayer configuration +typedef struct MultiXLayerConfig { + int num_xlayers; + XLayerEncConfig xlayers[MAX_NUM_XLAYERS - 1]; // up to 31 + // Global LCR + int enable_global_lcr; + int lcr_purpose_id; + int lcr_dependent_xlayers_flag; + int lcr_doh_constraint_flag; + // Local LCR + int enable_local_lcr; + int local_lcr_mode; // 0 = both (Global+Local, identical xlayer_info) + // 1 = local_only (Global without payload, Local is + // authoritative) + // MSDO + 
int enable_msdo; + // Atlas + int enable_atlas; + int atlas_mode; + int atlas_width; // canvas width (0 = derive from xlayers) + int atlas_height; // canvas height (0 = derive) + int atlas_uniform_spacing; // 1 = auto-grid, 0 = explicit positions + // OPS + int num_ops_sets; + OPSConfig ops_sets[MAX_NUM_OPS_ID]; + // Shared source (for subpicture encoding from single input) + char source_filename[PATH_MAX]; // shared source file (empty = disabled) + unsigned int source_width; // source resolution (0 = derive from file) + unsigned int source_height; + // Named input sources (replaces single source for multi-source encoding) + int num_input_sources; + InputSourceConfig input_sources[MAX_INPUT_SOURCES]; + // Bitstream + int combined_tu; + int monotonic_output_order; + double frame_rate; // 0 = use main encoder timebase (default) + int limit; // max frames to encode (0 = unlimited) + char output_filename[PATH_MAX]; +} MultiXLayerConfig; + +// Initialize config with defaults +static inline void xlayer_config_init(MultiXLayerConfig *cfg) { + memset(cfg, 0, sizeof(*cfg)); + cfg->enable_global_lcr = 1; + cfg->lcr_doh_constraint_flag = 1; + cfg->atlas_uniform_spacing = 1; + cfg->combined_tu = 1; + cfg->monotonic_output_order = 1; + for (int i = 0; i < MAX_NUM_XLAYERS - 1; i++) { + cfg->xlayers[i].xlayer_id = -1; + cfg->xlayers[i].qp = -1; + cfg->xlayers[i].bitrate = -1; + cfg->xlayers[i].cpu_used = -1; + cfg->xlayers[i].lag_in_frames = -1; + cfg->xlayers[i].sframe_dist = -1; + cfg->xlayers[i].sframe_mode = -1; + cfg->xlayers[i].sframe_type = -1; + cfg->xlayers[i].kf_max_dist = -1; + cfg->xlayers[i].fwd_kf_enabled = -1; + cfg->xlayers[i].enable_keyframe_filtering = -1; + cfg->xlayers[i].add_sef_for_hidden_frames = -1; + cfg->xlayers[i].atlas_pos_x = -1; + cfg->xlayers[i].atlas_pos_y = -1; + cfg->xlayers[i].input_source_idx = -1; + cfg->xlayers[i].profile = MAIN_420_10_IP1; + cfg->xlayers[i].level = SEQ_LEVEL_4_0; + cfg->xlayers[i].num_temporal_layers = 1; + 
cfg->xlayers[i].num_embedded_layers = 1; + cfg->xlayers[i].view_type = VIEW_UNSPECIFIED; + for (int j = 0; j < MAX_NUM_MLAYERS; j++) { + cfg->xlayers[i].mlayer_sources[j].input_source_idx = -1; + cfg->xlayers[i].mlayer_sources[j].atlas_pos_x = -1; + cfg->xlayers[i].mlayer_sources[j].atlas_pos_y = -1; + cfg->xlayers[i].mlayer_sources[j].width = 0; + cfg->xlayers[i].mlayer_sources[j].height = 0; + cfg->xlayers[i].mlayer_sources[j].dependency_mask = -1; + cfg->xlayers[i].mlayer_sources[j].color_primaries = -1; + cfg->xlayers[i].mlayer_sources[j].transfer_characteristics = -1; + cfg->xlayers[i].mlayer_sources[j].matrix_coefficients = -1; + cfg->xlayers[i].mlayer_sources[j].full_range_flag = -1; + } + } + for (int i = 0; i < MAX_NUM_OPS_ID; i++) { + for (int j = 0; j < MAX_OPS_COUNT; j++) { + cfg->ops_sets[i].ops[j].aggregate_level_idx = -1; + cfg->ops_sets[i].ops[j].max_tier_flag = -1; + for (int k = 0; k < MAX_NUM_XLAYERS; k++) { + cfg->ops_sets[i].ops[j].embedded_ops_id[k] = -1; + cfg->ops_sets[i].ops[j].embedded_op_index[k] = -1; + } + } + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AVM_COMMON_XLAYER_CONFIG_H_ diff --git a/common/xlayer_config_parse.c b/common/xlayer_config_parse.c new file mode 100644 index 0000000000..d304a8910c --- /dev/null +++ b/common/xlayer_config_parse.c @@ -0,0 +1,1273 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. 
+ */ + +#include "common/xlayer_config_parse.h" + +#include +#include +#include +#include + +#include "avm/avmcx.h" +#include "third_party/cJSON/cJSON.h" + +// Map layer type string to enum value +static int parse_layer_type(const char *str) { + if (!str) return TEXTURE_LAYER; + if (strcmp(str, "texture") == 0) return TEXTURE_LAYER; + if (strcmp(str, "auxiliary") == 0) return AUX_LAYER; + if (strcmp(str, "stereo") == 0) return STEREO_LAYER; + if (strcmp(str, "dependent") == 0) return DEPENDENT_LAYER; + fprintf(stderr, "Warning: unknown layer_type \"%s\", defaulting to texture\n", + str); + return TEXTURE_LAYER; +} + +// Map auxiliary type string to enum value +static int parse_auxiliary_type(const char *str) { + if (!str) return LCR_ALPHA_AUX; + if (strcmp(str, "alpha") == 0) return LCR_ALPHA_AUX; + if (strcmp(str, "depth") == 0) return LCR_DEPTH_AUX; + if (strcmp(str, "segmentation") == 0) return LCR_SEGMENTATION_AUX; + if (strcmp(str, "gain_map") == 0) return LCR_GAIN_MAP_AUX; + fprintf(stderr, + "Warning: unknown auxiliary_type \"%s\", defaulting to alpha\n", str); + return LCR_ALPHA_AUX; +} + +// Map view type string to enum value +static int parse_view_type(const char *str) { + if (!str) return VIEW_UNSPECIFIED; + if (strcmp(str, "unspecified") == 0) return VIEW_UNSPECIFIED; + if (strcmp(str, "center") == 0) return VIEW_CENTER; + if (strcmp(str, "left") == 0) return VIEW_LEFT; + if (strcmp(str, "right") == 0) return VIEW_RIGHT; + if (strcmp(str, "explicit") == 0) return VIEW_EXPLICIT; + fprintf(stderr, + "Warning: unknown view_type \"%s\", defaulting to " + "unspecified\n", + str); + return VIEW_UNSPECIFIED; +} + +// Map scaling mode string to enum value, returns -1 on error +static int parse_scaling_mode(const char *str) { + if (!str) return -1; + if (strcmp(str, "1:1") == 0 || strcmp(str, "normal") == 0) return AVME_NORMAL; + if (strcmp(str, "4/5") == 0) return AVME_FOURFIVE; + if (strcmp(str, "3/5") == 0) return AVME_THREEFIVE; + if (strcmp(str, "3/4") == 
0) return AVME_THREEFOUR; + if (strcmp(str, "1/4") == 0) return AVME_ONEFOUR; + if (strcmp(str, "1/8") == 0) return AVME_ONEEIGHT; + if (strcmp(str, "1/2") == 0) return AVME_ONETWO; + fprintf(stderr, "Warning: unknown scaling_mode \"%s\"\n", str); + return -1; +} + +// Map GOP mode string to integer value +static int parse_gop_mode(const char *str) { + if (!str) return 0; + if (strcmp(str, "closed") == 0) return 0; + if (strcmp(str, "open_leading") == 0) return 1; + if (strcmp(str, "open_sef") == 0) return 2; + fprintf(stderr, "Warning: unknown gop_mode \"%s\", defaulting to closed\n", + str); + return 0; +} + +// Map chroma format string to integer (420, 422, 444). Returns 0 on error. +static int parse_chroma_format(const char *str) { + if (!str) return 0; + if (strcmp(str, "yuv420") == 0 || strcmp(str, "420") == 0) return 420; + if (strcmp(str, "yuv422") == 0 || strcmp(str, "422") == 0) return 422; + if (strcmp(str, "yuv444") == 0 || strcmp(str, "444") == 0) return 444; + fprintf(stderr, "Warning: unknown format \"%s\"\n", str); + return 0; +} + +// Helper: warn about unknown keys in a JSON object +static void warn_unknown_keys(const cJSON *obj, const char *const known[], + int num_known, const char *section) { + const cJSON *item = NULL; + cJSON_ArrayForEach(item, obj) { + if (strcmp(item->string, "comment") == 0) continue; + int found = 0; + for (int i = 0; i < num_known; i++) { + if (strcmp(item->string, known[i]) == 0) { + found = 1; + break; + } + } + if (!found) + fprintf(stderr, "Warning: unknown key \"%s\" in %s (ignored)\n", + item->string, section); + } +} + +// Helper: get integer from JSON, or default +static int json_get_int(const cJSON *obj, const char *key, int default_val) { + const cJSON *item = cJSON_GetObjectItemCaseSensitive(obj, key); + if (cJSON_IsNumber(item)) return item->valueint; + return default_val; +} + +// Helper: get boolean from JSON, or default +static int json_get_bool(const cJSON *obj, const char *key, int default_val) { + const 
cJSON *item = cJSON_GetObjectItemCaseSensitive(obj, key); + if (cJSON_IsTrue(item)) return 1; + if (cJSON_IsFalse(item)) return 0; + if (cJSON_IsNumber(item)) return item->valueint != 0; + return default_val; +} + +// Helper: get string from JSON, or default +static const char *json_get_string(const cJSON *obj, const char *key, + const char *default_val) { + const cJSON *item = cJSON_GetObjectItemCaseSensitive(obj, key); + if (cJSON_IsString(item)) return item->valuestring; + return default_val; +} + +// Helper: parse frame_rate from JSON into num/den rational. +// Accepts: number (e.g., 30 -> 30/1), string "N/D" (e.g., "30000/1001"). +// Returns 0 on success (or no field present), -1 on error. +static int json_parse_frame_rate(const cJSON *obj, const char *key, int *num, + int *den) { + const cJSON *item = cJSON_GetObjectItemCaseSensitive(obj, key); + if (!item) return 0; + if (cJSON_IsNumber(item)) { + // Integer or float -> convert to rational + double v = item->valuedouble; + if (v <= 0.0) return 0; + // Check if it's an integer + if (v == (double)(int)v) { + *num = (int)v; + *den = 1; + } else { + // Common fractional rates: multiply by 1001 to check for NTSC + double v1001 = v * 1001.0; + if (fabs(v1001 - round(v1001)) < 0.01) { + *num = (int)round(v1001); + *den = 1001; + } else { + // General: use 1000x scale + *num = (int)round(v * 1000.0); + *den = 1000; + } + } + return 0; + } + if (cJSON_IsString(item)) { + int n = 0, d = 0; + if (sscanf(item->valuestring, "%d/%d", &n, &d) == 2 && d > 0) { + *num = n; + *den = d; + return 0; + } + // Try plain integer string + if (sscanf(item->valuestring, "%d", &n) == 1 && n > 0) { + *num = n; + *den = 1; + return 0; + } + fprintf(stderr, "Error: invalid frame_rate \"%s\"\n", item->valuestring); + return -1; + } + return 0; +} + +// Read entire file into a malloc'd string +static char *read_file_contents(const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { + fprintf(stderr, "Error: cannot open config file 
\"%s\"\n", path); + return NULL; + } + fseek(f, 0, SEEK_END); + long len = ftell(f); + fseek(f, 0, SEEK_SET); + if (len <= 0) { + fclose(f); + fprintf(stderr, "Error: config file \"%s\" is empty\n", path); + return NULL; + } + char *buf = (char *)malloc((size_t)len + 1); + if (!buf) { + fclose(f); + return NULL; + } + size_t read_len = fread(buf, 1, (size_t)len, f); + fclose(f); + buf[read_len] = '\0'; + return buf; +} + +// Parse a single xlayer entry from JSON into XLayerEncConfig +static int parse_xlayer_entry(const cJSON *entry, XLayerEncConfig *xlcfg) { + xlcfg->xlayer_id = json_get_int(entry, "xlayer_id", -1); + if (xlcfg->xlayer_id < 0 || xlcfg->xlayer_id > 30) { + fprintf(stderr, "Error: xlayer_id must be 0-30, got %d\n", + xlcfg->xlayer_id); + return -1; + } + + const char *input = json_get_string(entry, "input", NULL); + if (input) { + snprintf(xlcfg->input_filename, PATH_MAX, "%s", input); + } + + // Input source reference (for multi-source mode) + const char *isrc = json_get_string(entry, "input_source", NULL); + if (isrc) { + snprintf(xlcfg->input_source_name, MAX_SOURCE_NAME_LEN, "%s", isrc); + } + + xlcfg->width = (unsigned int)json_get_int(entry, "width", 0); + xlcfg->height = (unsigned int)json_get_int(entry, "height", 0); + xlcfg->profile = + (unsigned int)json_get_int(entry, "profile", MAIN_420_10_IP1); + xlcfg->tier = (unsigned int)json_get_int(entry, "tier", 0); + xlcfg->level = (unsigned int)json_get_int(entry, "level", SEQ_LEVEL_4_0); + + const char *lt = json_get_string(entry, "layer_type", "texture"); + xlcfg->layer_type = parse_layer_type(lt); + + if (xlcfg->layer_type == AUX_LAYER) { + const char *at = json_get_string(entry, "auxiliary_type", "alpha"); + xlcfg->auxiliary_type = parse_auxiliary_type(at); + } + + const char *vt = json_get_string(entry, "view_type", NULL); + if (vt) xlcfg->view_type = parse_view_type(vt); + + xlcfg->num_temporal_layers = json_get_int(entry, "num_temporal_layers", 1); + xlcfg->num_embedded_layers = 
json_get_int(entry, "num_embedded_layers", 1); + + // Color info + xlcfg->color_primaries = json_get_int(entry, "color_primaries", -1); + xlcfg->transfer_characteristics = + json_get_int(entry, "transfer_characteristics", -1); + xlcfg->matrix_coefficients = json_get_int(entry, "matrix_coefficients", -1); + xlcfg->full_range_flag = json_get_int(entry, "full_range_flag", -1); + + // Encoder overrides + xlcfg->qp = json_get_int(entry, "qp", -1); + xlcfg->bitrate = json_get_int(entry, "bitrate", -1); + xlcfg->cpu_used = json_get_int(entry, "cpu_used", -1); + xlcfg->lag_in_frames = json_get_int(entry, "lag_in_frames", -1); + + // S-Frame parameters + xlcfg->sframe_dist = json_get_int(entry, "sframe_dist", -1); + xlcfg->sframe_mode = json_get_int(entry, "sframe_mode", -1); + xlcfg->sframe_type = json_get_int(entry, "sframe_type", -1); + + // Coding structure overrides + xlcfg->kf_max_dist = json_get_int(entry, "kf_max_dist", -1); + const char *subgop = json_get_string(entry, "subgop_config", NULL); + if (subgop) { + snprintf(xlcfg->subgop_config_path, PATH_MAX, "%s", subgop); + } + + // GOP mode and overrides + const char *gop = json_get_string(entry, "gop_mode", NULL); + if (gop) xlcfg->gop_mode = parse_gop_mode(gop); + xlcfg->fwd_kf_enabled = json_get_int(entry, "fwd_kf_enabled", -1); + xlcfg->enable_keyframe_filtering = + json_get_int(entry, "enable_keyframe_filtering", -1); + xlcfg->add_sef_for_hidden_frames = + json_get_int(entry, "add_sef_for_hidden_frames", -1); + + // Atlas layout position + xlcfg->atlas_pos_x = json_get_int(entry, "atlas_pos_x", -1); + xlcfg->atlas_pos_y = json_get_int(entry, "atlas_pos_y", -1); + + // Scaling modes for embedded layers (flat array format) + const cJSON *scaling = + cJSON_GetObjectItemCaseSensitive(entry, "scaling_mode"); + const cJSON *el_arr = + cJSON_GetObjectItemCaseSensitive(entry, "embedded_layers"); + + if (cJSON_IsArray(scaling) && cJSON_IsArray(el_arr)) { + fprintf(stderr, + "Error: xlayer %d has both \"scaling_mode\" 
array and " + "\"embedded_layers\" — these are mutually exclusive\n", + xlcfg->xlayer_id); + return -1; + } + + int scaling_modes_explicit = 0; + + if (cJSON_IsArray(scaling)) { + scaling_modes_explicit = 1; + int n = cJSON_GetArraySize(scaling); + if (n > MAX_NUM_MLAYERS) n = MAX_NUM_MLAYERS; + for (int i = 0; i < n; i++) { + const cJSON *s = cJSON_GetArrayItem(scaling, i); + if (cJSON_IsNumber(s)) { + xlcfg->scaling_mode[i] = s->valueint; + } else if (cJSON_IsString(s)) { + int mode = parse_scaling_mode(s->valuestring); + if (mode < 0) { + fprintf(stderr, "Error: invalid scaling_mode \"%s\" for xlayer %d\n", + s->valuestring, xlcfg->xlayer_id); + return -1; + } + xlcfg->scaling_mode[i] = mode; + } + } + } + + // Per-embedded-layer configuration (new format) + if (cJSON_IsArray(el_arr)) { + int n = cJSON_GetArraySize(el_arr); + if (n != xlcfg->num_embedded_layers) { + fprintf(stderr, + "Error: xlayer %d \"embedded_layers\" array length %d does not " + "match num_embedded_layers %d\n", + xlcfg->xlayer_id, n, xlcfg->num_embedded_layers); + return -1; + } + for (int m = 0; m < n; m++) { + const cJSON *el = cJSON_GetArrayItem(el_arr, m); + MLayerSourceConfig *ms = &xlcfg->mlayer_sources[m]; + + // scaling_mode per embedded layer + const cJSON *sm_item = + cJSON_GetObjectItemCaseSensitive(el, "scaling_mode"); + if (sm_item != NULL) scaling_modes_explicit = 1; + if (cJSON_IsNumber(sm_item)) { + xlcfg->scaling_mode[m] = sm_item->valueint; + } else if (cJSON_IsString(sm_item)) { + int mode = parse_scaling_mode(sm_item->valuestring); + if (mode < 0) { + fprintf( + stderr, + "Error: invalid scaling_mode \"%s\" for xlayer %d mlayer %d\n", + sm_item->valuestring, xlcfg->xlayer_id, m); + return -1; + } + xlcfg->scaling_mode[m] = mode; + } + + // input_source + const char *ml_isrc = json_get_string(el, "input_source", NULL); + if (ml_isrc) + snprintf(ms->input_source_name, MAX_SOURCE_NAME_LEN, "%s", ml_isrc); + + // crop coordinates + ms->atlas_pos_x = json_get_int(el, 
"atlas_pos_x", -1); + ms->atlas_pos_y = json_get_int(el, "atlas_pos_y", -1); + ms->width = (unsigned int)json_get_int(el, "width", 0); + ms->height = (unsigned int)json_get_int(el, "height", 0); + + // depends_on → dependency_mask + const cJSON *deps = cJSON_GetObjectItemCaseSensitive(el, "depends_on"); + if (cJSON_IsArray(deps)) { + ms->dependency_mask = 0; + const cJSON *dep = NULL; + cJSON_ArrayForEach(dep, deps) { + if (cJSON_IsNumber(dep) && dep->valueint >= 0 && dep->valueint < m) { + ms->dependency_mask |= (1 << dep->valueint); + } else if (cJSON_IsNumber(dep) && dep->valueint >= m) { + fprintf(stderr, + "Error: xlayer %d mlayer %d depends_on[%d] >= self\n", + xlcfg->xlayer_id, m, dep->valueint); + return -1; + } + } + xlcfg->has_mlayer_dependencies = 1; + } + + // Content Interpretation overrides (inherit from xlayer if omitted) + ms->color_primaries = json_get_int(el, "color_primaries", -1); + ms->transfer_characteristics = + json_get_int(el, "transfer_characteristics", -1); + ms->matrix_coefficients = json_get_int(el, "matrix_coefficients", -1); + ms->full_range_flag = json_get_int(el, "full_range_flag", -1); + + // Warn about unknown keys in this embedded layer entry + { + static const char *const el_known[] = { + "scaling_mode", "input_source", "atlas_pos_x", + "atlas_pos_y", "width", "height", + "depends_on", "color_primaries", "transfer_characteristics", + "matrix_coefficients", "full_range_flag", + }; + char el_section[64]; + snprintf(el_section, sizeof(el_section), + "xlayer %d embedded_layers[%d]", xlcfg->xlayer_id, m); + warn_unknown_keys(el, el_known, sizeof(el_known) / sizeof(el_known[0]), + el_section); + } + + if (ms->input_source_name[0] || ms->atlas_pos_x >= 0 || ms->width > 0) + xlcfg->has_per_mlayer_sources = 1; + } + } + + // Parse generic codec controls array: [["name", value], ...] 
+ const cJSON *cc_arr = + cJSON_GetObjectItemCaseSensitive(entry, "codec_controls"); + if (cJSON_IsArray(cc_arr)) { + int n = cJSON_GetArraySize(cc_arr); + if (n > MAX_CODEC_CONTROLS) { + fprintf(stderr, "Error: xlayer %d has %d codec_controls (max %d)\n", + xlcfg->xlayer_id, n, MAX_CODEC_CONTROLS); + return -1; + } + xlcfg->num_codec_controls = n; + for (int c = 0; c < n; c++) { + const cJSON *pair = cJSON_GetArrayItem(cc_arr, c); + if (!cJSON_IsArray(pair) || cJSON_GetArraySize(pair) != 2) { + fprintf(stderr, + "Error: xlayer %d codec_controls[%d] must be [\"name\", " + "value]\n", + xlcfg->xlayer_id, c); + return -1; + } + const cJSON *name_item = cJSON_GetArrayItem(pair, 0); + const cJSON *val_item = cJSON_GetArrayItem(pair, 1); + if (!cJSON_IsString(name_item) || !cJSON_IsNumber(val_item)) { + fprintf(stderr, + "Error: xlayer %d codec_controls[%d] must be [string, " + "number]\n", + xlcfg->xlayer_id, c); + return -1; + } + snprintf(xlcfg->codec_controls[c].name, 64, "%s", name_item->valuestring); + xlcfg->codec_controls[c].value = val_item->valueint; + } + } + + // Derive default scaling modes when num_embedded_layers > 1 and none + // specified (all zeros) + if (!scaling_modes_explicit && xlcfg->num_embedded_layers > 1) { + int all_zero = 1; + for (int i = 0; i < xlcfg->num_embedded_layers; i++) { + if (xlcfg->scaling_mode[i] != 0) { + all_zero = 0; + break; + } + } + if (all_zero) { + // Default: smallest to full-res. Last layer is always AVME_NORMAL (0). 
+ if (xlcfg->num_embedded_layers == 2) { + xlcfg->scaling_mode[0] = AVME_ONETWO; + } else if (xlcfg->num_embedded_layers >= 3) { + xlcfg->scaling_mode[0] = AVME_ONEFOUR; + xlcfg->scaling_mode[1] = AVME_ONETWO; + } + } + } + + // Warn about unknown keys + { + static const char *const known[] = { + "xlayer_id", + "input", + "input_source", + "width", + "height", + "profile", + "tier", + "level", + "layer_type", + "auxiliary_type", + "view_type", + "num_temporal_layers", + "num_embedded_layers", + "color_primaries", + "transfer_characteristics", + "matrix_coefficients", + "full_range_flag", + "qp", + "bitrate", + "cpu_used", + "lag_in_frames", + "sframe_dist", + "sframe_mode", + "sframe_type", + "kf_max_dist", + "subgop_config", + "gop_mode", + "fwd_kf_enabled", + "enable_keyframe_filtering", + "add_sef_for_hidden_frames", + "atlas_pos_x", + "atlas_pos_y", + "scaling_mode", + "embedded_layers", + "codec_controls", + }; + char section[64]; + snprintf(section, sizeof(section), "xlayer %d", xlcfg->xlayer_id); + warn_unknown_keys(entry, known, sizeof(known) / sizeof(known[0]), section); + } + + return 0; +} + +// Parse operating point xlayer_map from JSON array of xlayer IDs to bitmask +static uint32_t parse_xlayer_map_array(const cJSON *arr) { + uint32_t map = 0; + if (!cJSON_IsArray(arr)) return 0; + const cJSON *elem = NULL; + cJSON_ArrayForEach(elem, arr) { + if (cJSON_IsNumber(elem) && elem->valueint >= 0 && + elem->valueint < (int)MAX_NUM_XLAYERS) { + map |= (1u << (unsigned int)elem->valueint); + } + } + return map; +} + +// Parse a single OPS set from JSON +static int parse_ops_entry(const cJSON *entry, OPSConfig *ops_cfg) { + ops_cfg->enable = 1; + ops_cfg->ops_id = json_get_int(entry, "ops_id", 0); + ops_cfg->priority = json_get_int(entry, "priority", 0); + ops_cfg->intent_present_flag = json_get_bool(entry, "intent_present", 1); + ops_cfg->ptl_present_flag = json_get_bool(entry, "ptl_present", 1); + ops_cfg->color_info_present_flag = + json_get_bool(entry, 
"color_info_present", 0); + ops_cfg->mlayer_info_idc = json_get_int(entry, "mlayer_info_idc", 0); + + const cJSON *op_arr = + cJSON_GetObjectItemCaseSensitive(entry, "operating_points"); + if (!cJSON_IsArray(op_arr)) { + fprintf(stderr, "Error: OPS %d missing \"operating_points\" array\n", + ops_cfg->ops_id); + return -1; + } + + ops_cfg->num_operating_points = cJSON_GetArraySize(op_arr); + if (ops_cfg->num_operating_points > MAX_OPS_COUNT) { + fprintf(stderr, "Error: OPS %d has %d operating points (max %d)\n", + ops_cfg->ops_id, ops_cfg->num_operating_points, MAX_OPS_COUNT); + return -1; + } + + for (int i = 0; i < ops_cfg->num_operating_points; i++) { + const cJSON *op_entry = cJSON_GetArrayItem(op_arr, i); + OperatingPointConfig *op = &ops_cfg->ops[i]; + + op->intent = json_get_int(op_entry, "intent", 0); + const cJSON *xmap = + cJSON_GetObjectItemCaseSensitive(op_entry, "xlayer_map"); + op->xlayer_map = parse_xlayer_map_array(xmap); + + // Per-xlayer overrides within this OP + const cJSON *ml = + cJSON_GetObjectItemCaseSensitive(op_entry, "mlayer_count"); + if (cJSON_IsArray(ml)) { + int n = cJSON_GetArraySize(ml); + for (int j = 0; j < n && j < MAX_NUM_XLAYERS; j++) { + const cJSON *v = cJSON_GetArrayItem(ml, j); + if (cJSON_IsNumber(v)) op->mlayer_count[j] = v->valueint; + } + } + + op->aggregate_level_idx = json_get_int(op_entry, "aggregate_level_idx", -1); + op->max_tier_flag = json_get_int(op_entry, "max_tier_flag", -1); + + // Warn about unknown keys in this operating point + { + static const char *const op_known[] = { + "intent", "xlayer_map", "mlayer_count", "aggregate_level_idx", + "max_tier_flag", + }; + char op_section[64]; + snprintf(op_section, sizeof(op_section), "ops %d operating_points[%d]", + ops_cfg->ops_id, i); + warn_unknown_keys(op_entry, op_known, + sizeof(op_known) / sizeof(op_known[0]), op_section); + } + } + + // Warn about unknown keys in this OPS entry + { + static const char *const known[] = { + "ops_id", "priority", 
"intent_present", + "ptl_present", "color_info_present", "mlayer_info_idc", + "operating_points", + }; + char section[64]; + snprintf(section, sizeof(section), "ops %d", ops_cfg->ops_id); + warn_unknown_keys(entry, known, sizeof(known) / sizeof(known[0]), section); + } + + return 0; +} + +int parse_multi_xlayer_config(const char *json_path, MultiXLayerConfig *cfg) { + xlayer_config_init(cfg); + + char *json_str = read_file_contents(json_path); + if (!json_str) return -1; + + cJSON *root = cJSON_Parse(json_str); + free(json_str); + if (!root) { + fprintf(stderr, "Error: failed to parse JSON in \"%s\"\n", json_path); + return -1; + } + + // Parse xlayers array + const cJSON *xlayers = cJSON_GetObjectItemCaseSensitive(root, "xlayers"); + if (!cJSON_IsArray(xlayers)) { + fprintf(stderr, "Error: config missing \"xlayers\" array\n"); + cJSON_Delete(root); + return -1; + } + + cfg->num_xlayers = cJSON_GetArraySize(xlayers); + if (cfg->num_xlayers < 1 || cfg->num_xlayers > MAX_NUM_XLAYERS - 1) { + fprintf(stderr, "Error: num_xlayers %d out of range (1-%d)\n", + cfg->num_xlayers, MAX_NUM_XLAYERS - 1); + cJSON_Delete(root); + return -1; + } + + for (int i = 0; i < cfg->num_xlayers; i++) { + const cJSON *entry = cJSON_GetArrayItem(xlayers, i); + if (parse_xlayer_entry(entry, &cfg->xlayers[i]) != 0) { + cJSON_Delete(root); + return -1; + } + } + + // Parse global_lcr + const cJSON *lcr = cJSON_GetObjectItemCaseSensitive(root, "global_lcr"); + if (cJSON_IsObject(lcr)) { + cfg->enable_global_lcr = json_get_bool(lcr, "enable", 1); + cfg->lcr_purpose_id = json_get_int(lcr, "purpose_id", 0); + cfg->lcr_dependent_xlayers_flag = + json_get_bool(lcr, "dependent_xlayers", 0); + cfg->lcr_doh_constraint_flag = json_get_bool(lcr, "doh_constraint", 1); + static const char *const lcr_known[] = { + "enable", + "purpose_id", + "dependent_xlayers", + "doh_constraint", + }; + warn_unknown_keys(lcr, lcr_known, sizeof(lcr_known) / sizeof(lcr_known[0]), + "global_lcr"); + } + + // Parse local_lcr 
+ const cJSON *local_lcr = cJSON_GetObjectItemCaseSensitive(root, "local_lcr"); + if (cJSON_IsObject(local_lcr)) { + cfg->enable_local_lcr = json_get_bool(local_lcr, "enable", 0); + const char *mode_str = json_get_string(local_lcr, "mode", "both"); + if (strcmp(mode_str, "local_only") == 0) + cfg->local_lcr_mode = 1; + else + cfg->local_lcr_mode = 0; + static const char *const ll_known[] = { "enable", "mode" }; + warn_unknown_keys(local_lcr, ll_known, + sizeof(ll_known) / sizeof(ll_known[0]), "local_lcr"); + } + + // Parse msdo + const cJSON *msdo = cJSON_GetObjectItemCaseSensitive(root, "msdo"); + if (cJSON_IsObject(msdo)) { + cfg->enable_msdo = json_get_bool(msdo, "enable", 0); + static const char *const msdo_known[] = { "enable" }; + warn_unknown_keys(msdo, msdo_known, + sizeof(msdo_known) / sizeof(msdo_known[0]), "msdo"); + } + + // Parse ops array + const cJSON *ops_arr = cJSON_GetObjectItemCaseSensitive(root, "ops"); + if (cJSON_IsArray(ops_arr)) { + cfg->num_ops_sets = cJSON_GetArraySize(ops_arr); + if (cfg->num_ops_sets > MAX_NUM_OPS_ID) cfg->num_ops_sets = MAX_NUM_OPS_ID; + for (int i = 0; i < cfg->num_ops_sets; i++) { + const cJSON *entry = cJSON_GetArrayItem(ops_arr, i); + if (parse_ops_entry(entry, &cfg->ops_sets[i]) != 0) { + cJSON_Delete(root); + return -1; + } + } + } + + // Parse atlas + const cJSON *atlas = cJSON_GetObjectItemCaseSensitive(root, "atlas"); + if (cJSON_IsObject(atlas)) { + cfg->enable_atlas = json_get_bool(atlas, "enable", 0); + cfg->atlas_mode = json_get_int(atlas, "mode", 0); + cfg->atlas_width = json_get_int(atlas, "width", 0); + cfg->atlas_height = json_get_int(atlas, "height", 0); + cfg->atlas_uniform_spacing = json_get_bool(atlas, "uniform_spacing", 1); + static const char *const atlas_known[] = { + "enable", "mode", "width", "height", "uniform_spacing", + }; + warn_unknown_keys(atlas, atlas_known, + sizeof(atlas_known) / sizeof(atlas_known[0]), "atlas"); + } + + // Parse named input sources array (new format) + const cJSON 
*inputs = cJSON_GetObjectItemCaseSensitive(root, "inputs"); + if (cJSON_IsArray(inputs)) { + int n = cJSON_GetArraySize(inputs); + if (n > MAX_INPUT_SOURCES) { + fprintf(stderr, "Error: too many input sources (%d > %d)\n", n, + MAX_INPUT_SOURCES); + cJSON_Delete(root); + return -1; + } + cfg->num_input_sources = n; + for (int i = 0; i < n; i++) { + const cJSON *inp = cJSON_GetArrayItem(inputs, i); + InputSourceConfig *src = &cfg->input_sources[i]; + const char *name = json_get_string(inp, "name", NULL); + if (name) snprintf(src->name, MAX_SOURCE_NAME_LEN, "%s", name); + const char *fn = json_get_string(inp, "filename", NULL); + if (fn) snprintf(src->filename, PATH_MAX, "%s", fn); + src->width = (unsigned int)json_get_int(inp, "width", 0); + src->height = (unsigned int)json_get_int(inp, "height", 0); + const char *fmt = json_get_string(inp, "format", NULL); + src->format = parse_chroma_format(fmt); + src->bit_depth = json_get_int(inp, "bit_depth", 0); + if (json_parse_frame_rate(inp, "frame_rate", &src->frame_rate_num, + &src->frame_rate_den) != 0) { + cJSON_Delete(root); + return -1; + } + static const char *const inp_known[] = { + "name", "filename", "width", "height", + "format", "bit_depth", "frame_rate", + }; + char inp_section[64]; + snprintf(inp_section, sizeof(inp_section), "inputs[%d]", i); + warn_unknown_keys(inp, inp_known, + sizeof(inp_known) / sizeof(inp_known[0]), inp_section); + } + } + + // Parse shared source (legacy single-source format) + const cJSON *source = cJSON_GetObjectItemCaseSensitive(root, "source"); + if (cJSON_IsObject(source)) { + if (cfg->num_input_sources > 0) { + fprintf(stderr, "Error: cannot specify both \"inputs\" and \"source\"\n"); + cJSON_Delete(root); + return -1; + } + const char *src_file = json_get_string(source, "filename", NULL); + if (src_file) { + snprintf(cfg->source_filename, PATH_MAX, "%s", src_file); + } + cfg->source_width = (unsigned int)json_get_int(source, "width", 0); + cfg->source_height = (unsigned 
int)json_get_int(source, "height", 0); + // Convert to input_sources[0] for unified handling + cfg->num_input_sources = 1; + InputSourceConfig *src = &cfg->input_sources[0]; + snprintf(src->name, MAX_SOURCE_NAME_LEN, "default"); + if (src_file) snprintf(src->filename, PATH_MAX, "%s", src_file); + src->width = cfg->source_width; + src->height = cfg->source_height; + static const char *const src_known[] = { "filename", "width", "height" }; + warn_unknown_keys(source, src_known, + sizeof(src_known) / sizeof(src_known[0]), "source"); + } + + // Parse bitstream options + cfg->combined_tu = json_get_bool(root, "combined_tu", 1); + cfg->monotonic_output_order = + json_get_bool(root, "monotonic_output_order", 0); + + // Parse frame rate (used for aggregate level derivation) + { + const cJSON *fps = cJSON_GetObjectItemCaseSensitive(root, "frame_rate"); + if (cJSON_IsNumber(fps)) cfg->frame_rate = fps->valuedouble; + } + + // Parse limit (max frames to encode) + cfg->limit = json_get_int(root, "limit", 0); + + const char *output = json_get_string(root, "output", NULL); + if (output) { + snprintf(cfg->output_filename, PATH_MAX, "%s", output); + } + + // Warn about unknown root-level keys + { + static const char *const known[] = { + "xlayers", "global_lcr", + "local_lcr", "msdo", + "ops", "atlas", + "inputs", "source", + "combined_tu", "monotonic_output_order", + "frame_rate", "limit", + "output", + }; + warn_unknown_keys(root, known, sizeof(known) / sizeof(known[0]), "root"); + } + + cJSON_Delete(root); + return 0; +} + +// Look up an input source by name. Returns its index, or -1 if not found. 
+static int find_input_source_by_name(const MultiXLayerConfig *cfg, + const char *name) { + for (int s = 0; s < cfg->num_input_sources; s++) { + if (strcmp(name, cfg->input_sources[s].name) == 0) return s; + } + return -1; +} + +int resolve_input_sources(MultiXLayerConfig *cfg) { + for (int i = 0; i < cfg->num_xlayers; i++) { + XLayerEncConfig *xl = &cfg->xlayers[i]; + xl->input_source_idx = -1; // default: own file + + // Skip xlayers with their own input file + if (xl->input_filename[0] != '\0') continue; + + if (xl->input_source_name[0] != '\0') { + // Explicit source reference — look up by name + xl->input_source_idx = + find_input_source_by_name(cfg, xl->input_source_name); + if (xl->input_source_idx < 0) { + fprintf(stderr, + "Error: xlayer %d references unknown input_source \"%s\"\n", + xl->xlayer_id, xl->input_source_name); + return -1; + } + } else if (cfg->num_input_sources == 1) { + // Single input source — all unassigned xlayers use it + xl->input_source_idx = 0; + } else if (cfg->num_input_sources > 1) { + fprintf(stderr, + "Error: xlayer %d has no input or input_source, and multiple " + "input sources are defined\n", + xl->xlayer_id); + return -1; + } + } + + // Resolve per-mlayer input sources + for (int i = 0; i < cfg->num_xlayers; i++) { + XLayerEncConfig *xl = &cfg->xlayers[i]; + if (!xl->has_per_mlayer_sources) continue; + for (int m = 0; m < xl->num_embedded_layers; m++) { + MLayerSourceConfig *ms = &xl->mlayer_sources[m]; + if (ms->input_source_name[0] == '\0') { + // Inherit from xlayer + ms->input_source_idx = xl->input_source_idx; + if (ms->atlas_pos_x < 0) ms->atlas_pos_x = xl->atlas_pos_x; + if (ms->atlas_pos_y < 0) ms->atlas_pos_y = xl->atlas_pos_y; + if (ms->width == 0) ms->width = xl->width; + if (ms->height == 0) ms->height = xl->height; + continue; + } + // Look up source by name + ms->input_source_idx = + find_input_source_by_name(cfg, ms->input_source_name); + if (ms->input_source_idx < 0) { + fprintf(stderr, + "Error: xlayer %d 
mlayer %d references unknown input_source " + "\"%s\"\n", + xl->xlayer_id, m, ms->input_source_name); + return -1; + } + // Fill in crop defaults if not specified + if (ms->atlas_pos_x < 0) ms->atlas_pos_x = 0; + if (ms->atlas_pos_y < 0) ms->atlas_pos_y = 0; + } + } + + // Resolve frame_skip for each input source based on frame rates. + // The master rate is the highest frame rate among all sources. + // Each source's rate must be an exact integer divisor of the master rate. + // Frame rates are rational (num/den) for exact arithmetic. + { + // Find the maximum frame rate using rational comparison: a/b > c/d iff + // a*d > c*b + int max_idx = -1; + for (int s = 0; s < cfg->num_input_sources; s++) { + if (cfg->input_sources[s].frame_rate_num <= 0) continue; + if (max_idx < 0) { + max_idx = s; + } else { + int64_t lhs = (int64_t)cfg->input_sources[s].frame_rate_num * + cfg->input_sources[max_idx].frame_rate_den; + int64_t rhs = (int64_t)cfg->input_sources[max_idx].frame_rate_num * + cfg->input_sources[s].frame_rate_den; + if (lhs > rhs) max_idx = s; + } + } + + if (max_idx >= 0) { + int max_n = cfg->input_sources[max_idx].frame_rate_num; + int max_d = cfg->input_sources[max_idx].frame_rate_den; + + for (int s = 0; s < cfg->num_input_sources; s++) { + int src_n = cfg->input_sources[s].frame_rate_num; + int src_d = cfg->input_sources[s].frame_rate_den; + if (src_n <= 0) { + // Unspecified — assume master rate + cfg->input_sources[s].frame_skip = 1; + continue; + } + // Ratio = (max_n/max_d) / (src_n/src_d) = (max_n * src_d) / + // (max_d * src_n) + int64_t ratio_num = (int64_t)max_n * src_d; + int64_t ratio_den = (int64_t)max_d * src_n; + // Must be an exact integer (ratio_den divides ratio_num evenly) + if (ratio_den == 0 || ratio_num % ratio_den != 0) { + fprintf(stderr, + "Error: input source \"%s\" frame_rate %d/%d is not an " + "exact divisor of the max frame_rate %d/%d\n", + cfg->input_sources[s].name, src_n, src_d, max_n, max_d); + return -1; + } + int skip = 
(int)(ratio_num / ratio_den); + if (skip < 1) { + fprintf(stderr, + "Error: input source \"%s\" frame_rate %d/%d exceeds the " + "max frame_rate %d/%d\n", + cfg->input_sources[s].name, src_n, src_d, max_n, max_d); + return -1; + } + cfg->input_sources[s].frame_skip = skip; + } + } else { + // No frame rates specified — all sources run at same rate + for (int s = 0; s < cfg->num_input_sources; s++) + cfg->input_sources[s].frame_skip = 1; + } + } + + return 0; +} + +// Resolve per-mlayer CI inheritance: if an mlayer's CI field is -1, inherit +// from the parent xlayer's value. Must be called after resolve_input_sources(). +void resolve_mlayer_ci(MultiXLayerConfig *cfg) { + for (int i = 0; i < cfg->num_xlayers; i++) { + XLayerEncConfig *xl = &cfg->xlayers[i]; + for (int m = 0; m < xl->num_embedded_layers; m++) { + MLayerSourceConfig *ms = &xl->mlayer_sources[m]; + if (ms->color_primaries == -1) ms->color_primaries = xl->color_primaries; + if (ms->transfer_characteristics == -1) + ms->transfer_characteristics = xl->transfer_characteristics; + if (ms->matrix_coefficients == -1) + ms->matrix_coefficients = xl->matrix_coefficients; + if (ms->full_range_flag == -1) ms->full_range_flag = xl->full_range_flag; + } + } +} + +int validate_multi_xlayer_config(const MultiXLayerConfig *cfg) { + if (cfg->num_xlayers < 1) { + fprintf(stderr, "Error: must have at least 1 xlayer\n"); + return -1; + } + + // Check xlayer_ids are unique and in range + int seen[MAX_NUM_XLAYERS] = { 0 }; + for (int i = 0; i < cfg->num_xlayers; i++) { + int id = cfg->xlayers[i].xlayer_id; + if (id < 0 || id > 30) { + fprintf(stderr, "Error: xlayer %d has invalid xlayer_id %d\n", i, id); + return -1; + } + if (seen[id]) { + fprintf(stderr, "Error: duplicate xlayer_id %d\n", id); + return -1; + } + seen[id] = 1; + } + + // Validate input source names are unique and non-empty + for (int i = 0; i < cfg->num_input_sources; i++) { + if (cfg->input_sources[i].name[0] == '\0') { + fprintf(stderr, "Error: input 
source %d has no name\n", i); + return -1; + } + for (int j = i + 1; j < cfg->num_input_sources; j++) { + if (strcmp(cfg->input_sources[i].name, cfg->input_sources[j].name) == 0) { + fprintf(stderr, "Error: duplicate input source name \"%s\"\n", + cfg->input_sources[i].name); + return -1; + } + } + } + + // Validate each xlayer has input (or input source) and dimensions + int has_shared_source = + (cfg->source_filename[0] != '\0' || cfg->num_input_sources > 0); + for (int i = 0; i < cfg->num_xlayers; i++) { + const XLayerEncConfig *xl = &cfg->xlayers[i]; + if (xl->input_filename[0] == '\0' && xl->input_source_idx < 0 && + !has_shared_source) { + fprintf(stderr, + "Error: xlayer %d missing input filename and no shared source\n", + xl->xlayer_id); + return -1; + } + // When using a shared/named input source, atlas positions and dimensions + // are required + if (xl->input_source_idx >= 0) { + if (xl->atlas_pos_x < 0 || xl->atlas_pos_y < 0) { + fprintf(stderr, + "Error: xlayer %d requires atlas_pos_x/y when using input " + "source\n", + xl->xlayer_id); + return -1; + } + if (xl->width == 0 || xl->height == 0) { + fprintf(stderr, + "Error: xlayer %d requires width/height when using input " + "source\n", + xl->xlayer_id); + return -1; + } + } + } + + // Per-source-group chroma validation: xlayers sharing the same input source + // must have the same chroma format (profile determines chroma) + for (int s = 0; s < cfg->num_input_sources; s++) { + int ref_chroma = -1; + int ref_xlayer_id = -1; + unsigned int ref_profile = 0; + for (int i = 0; i < cfg->num_xlayers; i++) { + if (cfg->xlayers[i].input_source_idx != s) continue; + int chroma = (cfg->xlayers[i].profile <= MAIN_420_10_IP1) ? 0 + : (cfg->xlayers[i].profile == MAIN_422_10_IP1) ? 
1 + : 2; + if (ref_chroma < 0) { + ref_chroma = chroma; + ref_xlayer_id = cfg->xlayers[i].xlayer_id; + ref_profile = cfg->xlayers[i].profile; + } else if (chroma != ref_chroma) { + fprintf(stderr, + "Error: xlayers sharing input source \"%s\" must use the " + "same chroma format (xlayer %d profile %u vs xlayer %d " + "profile %u)\n", + cfg->input_sources[s].name, ref_xlayer_id, ref_profile, + cfg->xlayers[i].xlayer_id, cfg->xlayers[i].profile); + return -1; + } + } + } + + // Validate OPS operating points reference valid xlayer_ids + for (int s = 0; s < cfg->num_ops_sets; s++) { + const OPSConfig *ops = &cfg->ops_sets[s]; + if (!ops->enable) continue; + for (int p = 0; p < ops->num_operating_points; p++) { + uint32_t xmap = ops->ops[p].xlayer_map; + for (int bit = 0; bit < 31; bit++) { + if (xmap & (1u << bit)) { + if (!seen[bit]) { + fprintf(stderr, + "Error: OPS %d OP %d references xlayer_id %d which is " + "not in the config\n", + ops->ops_id, p, bit); + return -1; + } + } + } + } + } + + // When monotonic_output_order is disabled, all xlayers must use the same + // coding structure (temporal layers, lag-in-frames, keyframe interval, + // sub-GOP config, and GOP mode) so that their output ordering is + // synchronized. 
+ if (!cfg->monotonic_output_order && cfg->num_xlayers > 1) { + const XLayerEncConfig *ref = &cfg->xlayers[0]; + for (int i = 1; i < cfg->num_xlayers; i++) { + const XLayerEncConfig *xl = &cfg->xlayers[i]; + if (xl->num_temporal_layers != ref->num_temporal_layers) { + fprintf(stderr, + "Error: monotonic_output_order=0 requires all xlayers to use " + "the same num_temporal_layers (xlayer %d has %d, xlayer %d has " + "%d)\n", + ref->xlayer_id, ref->num_temporal_layers, xl->xlayer_id, + xl->num_temporal_layers); + return -1; + } + if (xl->lag_in_frames != ref->lag_in_frames) { + fprintf(stderr, + "Error: monotonic_output_order=0 requires all xlayers to use " + "the same lag_in_frames (xlayer %d has %d, xlayer %d has %d)\n", + ref->xlayer_id, ref->lag_in_frames, xl->xlayer_id, + xl->lag_in_frames); + return -1; + } + if (xl->kf_max_dist != ref->kf_max_dist) { + fprintf(stderr, + "Error: monotonic_output_order=0 requires all xlayers to use " + "the same kf_max_dist (xlayer %d has %d, xlayer %d has %d)\n", + ref->xlayer_id, ref->kf_max_dist, xl->xlayer_id, + xl->kf_max_dist); + return -1; + } + if (strcmp(xl->subgop_config_path, ref->subgop_config_path) != 0) { + fprintf(stderr, + "Error: monotonic_output_order=0 requires all xlayers to use " + "the same subgop_config (xlayer %d has \"%s\", xlayer %d has " + "\"%s\")\n", + ref->xlayer_id, ref->subgop_config_path, xl->xlayer_id, + xl->subgop_config_path); + return -1; + } + if (xl->gop_mode != ref->gop_mode) { + fprintf(stderr, + "Error: monotonic_output_order=0 requires all xlayers to use " + "the same gop_mode (xlayer %d has %d, xlayer %d has %d)\n", + ref->xlayer_id, ref->gop_mode, xl->xlayer_id, xl->gop_mode); + return -1; + } + } + } + + // Validate embedded layer configuration + for (int i = 0; i < cfg->num_xlayers; i++) { + const XLayerEncConfig *xl = &cfg->xlayers[i]; + if (xl->num_embedded_layers < 1 || + xl->num_embedded_layers > MAX_NUM_MLAYERS) { + fprintf(stderr, + "Error: xlayer %d num_embedded_layers %d out 
of range (1-%d)\n", + xl->xlayer_id, xl->num_embedded_layers, MAX_NUM_MLAYERS); + return -1; + } + if (xl->num_embedded_layers > 1) { + // Last layer must be full-resolution (AVME_NORMAL = 0) + if (xl->scaling_mode[xl->num_embedded_layers - 1] != AVME_NORMAL) { + fprintf(stderr, + "Error: xlayer %d scaling_mode[%d] must be 0 (full-res) for " + "the last embedded layer\n", + xl->xlayer_id, xl->num_embedded_layers - 1); + return -1; + } + // Validate all scaling mode values are in range + for (int m = 0; m < xl->num_embedded_layers; m++) { + if (xl->scaling_mode[m] < AVME_NORMAL || + xl->scaling_mode[m] > AVME_ONETWO) { + fprintf(stderr, + "Error: xlayer %d scaling_mode[%d]=%d out of range (0-%d)\n", + xl->xlayer_id, m, xl->scaling_mode[m], AVME_ONETWO); + return -1; + } + } + } + } + + // Validate per-embedded-layer source configuration + for (int i = 0; i < cfg->num_xlayers; i++) { + const XLayerEncConfig *xl = &cfg->xlayers[i]; + if (!xl->has_per_mlayer_sources && !xl->has_mlayer_dependencies) continue; + for (int m = 0; m < xl->num_embedded_layers; m++) { + const MLayerSourceConfig *ms = &xl->mlayer_sources[m]; + // Per-mlayer source requires width, height, and crop coordinates + if (ms->input_source_name[0] != '\0' || ms->input_source_idx >= 0) { + if (ms->width == 0 || ms->height == 0) { + fprintf(stderr, + "Error: xlayer %d mlayer %d requires width/height when " + "using per-mlayer input source\n", + xl->xlayer_id, m); + return -1; + } + if (ms->atlas_pos_x < 0 || ms->atlas_pos_y < 0) { + fprintf(stderr, + "Error: xlayer %d mlayer %d requires atlas_pos_x/y when " + "using per-mlayer input source\n", + xl->xlayer_id, m); + return -1; + } + } + // dependency_mask validation + if (ms->dependency_mask >= 0) { + if (m == 0 && ms->dependency_mask != 0) { + fprintf(stderr, + "Error: xlayer %d mlayer 0 cannot depend on any lower " + "mlayer (depends_on must be empty)\n", + xl->xlayer_id); + return -1; + } + // Check all set bits reference valid lower mlayer indices + 
for (int j = m; j < MAX_NUM_MLAYERS; j++) { + if (ms->dependency_mask & (1 << j)) { + fprintf(stderr, + "Error: xlayer %d mlayer %d depends_on references " + "mlayer %d (must be < %d)\n", + xl->xlayer_id, m, j, m); + return -1; + } + } + } + } + } + + // Validate per-mlayer CI values are in valid CICP ranges + for (int i = 0; i < cfg->num_xlayers; i++) { + const XLayerEncConfig *xl = &cfg->xlayers[i]; + for (int m = 0; m < xl->num_embedded_layers; m++) { + const MLayerSourceConfig *ms = &xl->mlayer_sources[m]; + if (ms->color_primaries != -1 && + (ms->color_primaries < 0 || ms->color_primaries > 255)) { + fprintf(stderr, + "Error: xlayer %d mlayer %d color_primaries %d out of range " + "(0-255)\n", + xl->xlayer_id, m, ms->color_primaries); + return -1; + } + if (ms->transfer_characteristics != -1 && + (ms->transfer_characteristics < 0 || + ms->transfer_characteristics > 255)) { + fprintf(stderr, + "Error: xlayer %d mlayer %d transfer_characteristics %d out " + "of range (0-255)\n", + xl->xlayer_id, m, ms->transfer_characteristics); + return -1; + } + if (ms->matrix_coefficients != -1 && + (ms->matrix_coefficients < 0 || ms->matrix_coefficients > 255)) { + fprintf(stderr, + "Error: xlayer %d mlayer %d matrix_coefficients %d out of " + "range (0-255)\n", + xl->xlayer_id, m, ms->matrix_coefficients); + return -1; + } + if (ms->full_range_flag != -1 && + (ms->full_range_flag < 0 || ms->full_range_flag > 1)) { + fprintf(stderr, + "Error: xlayer %d mlayer %d full_range_flag %d must be 0 or " + "1\n", + xl->xlayer_id, m, ms->full_range_flag); + return -1; + } + } + } + + // Validate GOP mode constraints + if (cfg->monotonic_output_order) { + for (int i = 0; i < cfg->num_xlayers; i++) { + if (cfg->xlayers[i].gop_mode == 1) { + fprintf(stderr, + "Error: gop_mode \"open_leading\" is not allowed with " + "monotonic_output_order=1 (xlayer %d). 
Leading OBUs require " + "non-monotonic output.\n", + cfg->xlayers[i].xlayer_id); + return -1; + } + } + } + + return 0; +} diff --git a/common/xlayer_config_parse.h b/common/xlayer_config_parse.h new file mode 100644 index 0000000000..325d830a20 --- /dev/null +++ b/common/xlayer_config_parse.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. + */ + +#ifndef AVM_COMMON_XLAYER_CONFIG_PARSE_H_ +#define AVM_COMMON_XLAYER_CONFIG_PARSE_H_ + +#include "common/xlayer_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Parse a JSON configuration file for multi-xlayer encoding. +// Returns 0 on success, -1 on error (with message printed to stderr). +int parse_multi_xlayer_config(const char *json_path, MultiXLayerConfig *cfg); + +// Resolve input_source_idx for each xlayer after parsing. +// Must be called between parse and validate. +// Returns 0 on success, -1 on error. +int resolve_input_sources(MultiXLayerConfig *cfg); + +// Resolve per-mlayer CI inheritance: if an mlayer's CI field is -1, inherit +// from the parent xlayer's value. Must be called after +// resolve_input_sources(). +void resolve_mlayer_ci(MultiXLayerConfig *cfg); + +// Validate a parsed multi-xlayer configuration. +// Returns 0 on success, -1 on error. 
+int validate_multi_xlayer_config(const MultiXLayerConfig *cfg); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AVM_COMMON_XLAYER_CONFIG_PARSE_H_ diff --git a/doc/multi_xlayer_encoding.md b/doc/multi_xlayer_encoding.md new file mode 100644 index 0000000000..98a3454d78 --- /dev/null +++ b/doc/multi_xlayer_encoding.md @@ -0,0 +1,1255 @@ +# Multi-XLayer Encoding Guide + +This document describes how to use AVM's multi-xlayer encoding framework +to encode multiple extended layers (xlayers) into a single combined +bitstream. Xlayers enable use cases such as texture+depth, stereo video, +subpicture tiling, and spatially scalable encoding with embedded layers. + +## Table of Contents + +- [Quick Start](#quick-start) +- [CLI Usage](#cli-usage) +- [JSON Configuration Reference](#json-configuration-reference) + - [Top-Level Fields](#top-level-fields) + - [XLayer Entry Fields](#xlayer-entry-fields) + - [Embedded Layers (MLayers)](#embedded-layers-mlayers) + - [Per-Embedded-Layer Configuration](#per-embedded-layer-configuration) + - [Global LCR](#global-lcr) + - [Local LCR](#local-lcr) + - [OPS (Operating Point Set)](#ops-operating-point-set) + - [Atlas](#atlas) + - [Input Sources](#input-sources) + - [Codec Controls](#codec-controls) + - [GOP Modes](#gop-modes) +- [Use Cases and Examples](#use-cases-and-examples) + - [Texture + Depth](#texture--depth) + - [Stereo Video](#stereo-video) + - [Subpicture Tiling](#subpicture-tiling) + - [Subpicture with Auxiliary Layers](#subpicture-with-auxiliary-layers) + - [Spatial Scalability with Embedded Layers](#spatial-scalability-with-embedded-layers) + - [Mixed Embedded Layer Counts](#mixed-embedded-layer-counts) + - [Stereo via Embedded Layers](#stereo-via-embedded-layers) + - [Subpicture Tiling via Embedded Layers](#subpicture-tiling-via-embedded-layers) + - [Texture + Depth via Embedded Layers with XLayers](#texture--depth-via-embedded-layers-with-xlayers) +- [GOP Mode and Output Order](#gop-mode-and-output-order) + - 
[Compatibility Matrix](#compatibility-matrix) + - [Closed GOP, Non-Monotonic (Multi-XLayer + Multi-MLayer)](#closed-gop-non-monotonic-multi-xlayer--multi-mlayer) + - [Closed GOP, Monotonic (Multi-XLayer + Multi-MLayer)](#closed-gop-monotonic-multi-xlayer--multi-mlayer) + - [Open Leading, Non-Monotonic (Multi-XLayer + Multi-MLayer)](#open-leading-non-monotonic-multi-xlayer--multi-mlayer) + - [Open SEF, Monotonic (Multi-XLayer + Multi-MLayer)](#open-sef-monotonic-multi-xlayer--multi-mlayer) +- [Decoding](#decoding) +- [Stream Demuxing](#stream-demuxing) +- [Constraints and Validation](#constraints-and-validation) + +--- + +## Quick Start + +1. Create a JSON configuration file describing your xlayers. +2. Encode with `avmenc --xlayer-config config.json`. +3. Decode with `avmdec --all-layers` to get all layers. + +Minimal two-layer example: + +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5 } + ], + "output": "combined.obu" +} +``` + +```bash +avmenc --xlayer-config two_layer.json --limit=30 +avmdec --all-layers -o decoded.y4m combined.obu +``` + +--- + +## CLI Usage + +### Encoder + +Multi-xlayer encoding is triggered by passing `--xlayer-config`: + +```bash +avmenc --xlayer-config config.json [--limit=N] [--framerate=N/D] +``` + +When `--xlayer-config` is provided, the encoder ignores the normal +single-stream arguments (input file, `--width`, `--height`, etc.) and +reads all configuration from the JSON file. 
Standard arguments that are +still honored: + +| Argument | Effect | +|----------|--------| +| `--limit=N` | Encode at most N source frames | +| `--framerate=N/D` | Override timebase for all xlayers | + +### Decoder + +```bash +avmdec --all-layers -o output.y4m input.obu +avmdec --all-layers --num-streams=N -o output_%d.y4m input.obu +avmdec --all-layers --atlas-composite --xlayer-config config.json -o composite.y4m input.obu +``` + +| Flag | Purpose | +|------|---------| +| `--all-layers` | Output all decoded frames (all xlayers, all mlayers) | +| `--num-streams=N` | Split output into N separate files (`output_0.y4m`, `output_1.y4m`, ...) | +| `--xlayer-config` | Provide atlas layout for `--atlas-composite` | +| `--atlas-composite` | Composite decoded xlayers onto an atlas canvas using the layout from the config | + +--- + +## JSON Configuration Reference + +### Top-Level Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `xlayers` | array | *required* | Array of xlayer entries (1-31) | +| `inputs` | array | `[]` | Named input sources (see [Input Sources](#input-sources)) | +| `source` | object | | Legacy single shared source (converted to `inputs[0]` internally) | +| `output` | string | `""` | Output bitstream path | +| `combined_tu` | bool | `true` | Combine all xlayer OBUs into shared TUs | +| `monotonic_output_order` | bool | `false` | Encoder outputs frames in monotonic order | +| `frame_rate` | number | `0` | Frame rate for aggregate level derivation (0 = use encoder timebase) | + +### XLayer Entry Fields + +Each entry in the `xlayers` array configures one extended layer: + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `xlayer_id` | int | *required* | Unique ID, 0-30 | +| `input` | string | `""` | Input file path (Y4M or raw YUV). Not needed if using `input_source` or single `inputs` entry. 
| +| `input_source` | string | `""` | Reference to a named input source from `inputs` array. | +| `width` | int | 0 | Frame width (required for raw YUV or shared source) | +| `height` | int | 0 | Frame height | +| `profile` | int | 3 | AV2 profile (0-3 = Main 4:2:0 10-bit variants, 4 = Main 4:2:2 10-bit, 5 = Main 4:4:4 10-bit) | +| `tier` | int | 0 | Tier | +| `level` | int | 16 | Level index (e.g. 16 = Level 4.0) | +| `layer_type` | string | `"texture"` | `"texture"`, `"auxiliary"`, `"stereo"`, or `"dependent"` | +| `auxiliary_type` | string | `"alpha"` | Only when `layer_type` is `"auxiliary"`: `"alpha"`, `"depth"`, `"segmentation"`, `"gain_map"` | +| `view_type` | string | `"unspecified"` | `"unspecified"`, `"center"`, `"left"`, `"right"`, `"explicit"` | +| `qp` | int | -1 | Fixed QP (0-255). -1 = use global default. | +| `bitrate` | int | -1 | Target bitrate in kbps. -1 = use QP mode. | +| `cpu_used` | int | -1 | Encoder speed preset (0=slowest, 9=fastest). -1 = default (5). | +| `lag_in_frames` | int | -1 | Lookahead buffer size. -1 = default. | +| `kf_max_dist` | int | -1 | Maximum keyframe interval. -1 = default. | +| `subgop_config` | string | `""` | Path to sub-GOP JSON config file | +| `gop_mode` | string | `"closed"` | `"closed"`, `"open_leading"`, or `"open_sef"` | +| `fwd_kf_enabled` | int | -1 | Forward keyframe override. -1 = derive from `gop_mode`. | +| `enable_keyframe_filtering` | int | -1 | KF filtering override. -1 = derive. | +| `add_sef_for_hidden_frames` | int | -1 | SEF for hidden frames override. -1 = derive. 
| +| `num_temporal_layers` | int | 1 | Number of temporal layers (1-8) | +| `num_embedded_layers` | int | 1 | Number of spatial embedded layers (1-8) | +| `scaling_mode` | array | auto | Scaling mode per embedded layer (see [Embedded Layers](#embedded-layers-mlayers)) | +| `embedded_layers` | array | | Per-embedded-layer configuration (see [Per-Embedded-Layer Configuration](#per-embedded-layer-configuration)) | +| `color_primaries` | int | -1 | Color primaries. -1 = not signaled. | +| `transfer_characteristics` | int | -1 | Transfer characteristics. -1 = not signaled. | +| `matrix_coefficients` | int | -1 | Matrix coefficients. -1 = not signaled. | +| `full_range_flag` | int | -1 | Full range flag. -1 = not signaled. | +| `atlas_pos_x` | int | -1 | X position in atlas canvas (required for shared source mode) | +| `atlas_pos_y` | int | -1 | Y position in atlas canvas | +| `codec_controls` | array | `[]` | Generic codec controls applied after encoder init (see [Codec Controls](#codec-controls)) | + +### Embedded Layers (MLayers) + +Each xlayer can independently encode multiple spatial embedded layers +(mlayers). The encoder is called once per mlayer for each source frame, +with the appropriate scaling mode and mlayer ID set before each call. +The encoder internally rescales the source image. + +**Configuration:** + +```json +{ + "xlayer_id": 0, + "num_embedded_layers": 3, + "scaling_mode": ["1/4", "1/2", "1:1"] +} +``` + +The `scaling_mode` array specifies the spatial scale for each embedded +layer, from smallest to largest. The last entry must always be `"1:1"` +(full resolution). + +**Scaling mode values:** + +| String | Integer | Scale Factor | +|--------|---------|-------------| +| `"1:1"` or `"normal"` | 0 | Full resolution | +| `"4/5"` | 1 | 4/5 scale | +| `"3/5"` | 2 | 3/5 scale | +| `"3/4"` | 3 | 3/4 scale | +| `"1/4"` | 4 | 1/4 scale | +| `"1/8"` | 5 | 1/8 scale | +| `"1/2"` | 6 | 1/2 scale | + +Both string and integer values are accepted in JSON. 
+ +**Default derivation:** When `num_embedded_layers > 1` and `scaling_mode` +is omitted, defaults are derived automatically: + +| Layers | Default `scaling_mode` | +|--------|----------------------| +| 2 | `["1/2", "1:1"]` | +| 3 | `["1/4", "1/2", "1:1"]` | + +**LCR signaling:** For each non-full-resolution mlayer, the LCR OBU +signals `lcr_same_sh_max_resolution_flag = 0` with +`lcr_max_expected_width/height` set to the xlayer's full resolution +(not the scaled size). This is because the encoder may produce +full-resolution frames (e.g., on keyframes), so the LCR must signal +the maximum possible dimensions. Full-resolution mlayers signal +`lcr_same_sh_max_resolution_flag = 1`. + +#### Per-Embedded-Layer Configuration + +When different embedded layers need genuinely different input content +(e.g., stereo views, subpicture tiles, overlay+base), use the +`"embedded_layers"` array to configure each mlayer independently. + +```json +{ + "xlayer_id": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "input_source": "left", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "right", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [0] } + ] +} +``` + +Each entry in `"embedded_layers"` corresponds to one mlayer (in order). +All fields are optional — omitted fields inherit from the parent xlayer. 
+ +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `scaling_mode` | string/int | inherit | Encoder-internal scaling for this mlayer | +| `input_source` | string | inherit | Named input source for this mlayer | +| `atlas_pos_x` | int | inherit | Crop origin X within source | +| `atlas_pos_y` | int | inherit | Crop origin Y within source | +| `width` | int | inherit | Crop width | +| `height` | int | inherit | Crop height | +| `depends_on` | array of int | default linear | Which lower mlayer indices this depends on for inter-layer prediction | +| `color_primaries` | int | inherit | CICP color primaries (0-255), inherits from xlayer if omitted | +| `transfer_characteristics` | int | inherit | CICP transfer characteristics (0-255), inherits from xlayer if omitted | +| `matrix_coefficients` | int | inherit | CICP matrix coefficients (0-255), inherits from xlayer if omitted | +| `full_range_flag` | int | inherit | 0=limited range, 1=full range, inherits from xlayer if omitted | + +**`depends_on` semantics:** +- Absent: default linear chain (each mlayer depends on all lower mlayers) +- `[]`: independent (no inter-layer prediction) +- `[0]`: depends only on mlayer 0 +- `[0, 1]`: depends on mlayers 0 and 1 + +**Mutual exclusion:** `"embedded_layers"` and the flat `"scaling_mode"` +array cannot both be present on the same xlayer entry. Use one or the +other. + +**Backward compatibility:** When `"embedded_layers"` is absent, all +behavior is unchanged — the existing flat `"scaling_mode"` array and +default scaling derivation work as before. + +#### Content Interpretation (CI) Per MLayer + +Each embedded layer can have its own Content Interpretation (CI) OBU with +distinct CICP color properties. This is useful when different mlayers carry +content with different color characteristics (e.g., HDR base layer with +SDR enhancement, or depth with different matrix coefficients). + +**Inheritance rules:** +1. 
If an mlayer omits a CI field (or sets it to `-1`), it inherits + from the parent xlayer's value. +2. At the bitstream level, if an mlayer's resolved CI is identical to the + CI of the first layer it depends on (via `depends_on`), the CI OBU is omitted — + the decoder inherits it automatically. +3. CI is written at every random access point (CLK) for each mlayer that + has distinct CI. +4. CI must not change within a coded video sequence (CVS). + +**Example:** Stereo with different color primaries per view: +```json +{ + "xlayer_id": 0, + "color_primaries": 1, + "transfer_characteristics": 1, + "matrix_coefficients": 1, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "color_primaries": 9, + "transfer_characteristics": 16, "matrix_coefficients": 9 }, + { "scaling_mode": "1:1" } + ] +} +``` + +In this example, mlayer 0 uses BT.2020/PQ (CICP 9/16/9) while mlayer 1 +inherits BT.709 (CICP 1/1/1) from the xlayer. Each gets its own CI OBU +in the bitstream with the correct `obu_layer` value. + +### Global LCR + +The Global Layer Configuration Record describes the overall multi-layer +structure in the bitstream. + +```json +"global_lcr": { + "enable": true, + "purpose_id": 0, + "dependent_xlayers": false, + "doh_constraint": true +} +``` + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `enable` | bool | `true` | Write a Global LCR OBU | +| `purpose_id` | int | 0 | LCR purpose (0=unspecified, 6=multiview, etc.) | +| `dependent_xlayers` | bool | `false` | Signal dependent xlayers | +| `doh_constraint` | bool | `true` | Decode order hint constraint | + +### Local LCR + +Local LCR OBUs provide per-xlayer layer configuration. 
+ +```json +"local_lcr": { + "enable": true, + "mode": "both" +} +``` + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `enable` | bool | `false` | Write Local LCR OBUs | +| `mode` | string | `"both"` | `"both"` = Global + Local with identical xlayer_info; `"local_only"` = Global without payload, Local is authoritative | + +### OPS (Operating Point Set) + +Operating points define subsets of the bitstream that can be +independently decoded. Each operating point specifies which xlayers +(and optionally how many mlayers per xlayer) are included. + +```json +"ops": [ + { + "ops_id": 0, + "priority": 0, + "intent_present": true, + "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1] }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [3] }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [3, 1] } + ] + } +] +``` + +| Field | Type | Description | +|-------|------|-------------| +| `ops_id` | int | OPS identifier (0-15) | +| `priority` | int | OPS priority | +| `intent_present` | bool | Signal intent per operating point | +| `ptl_present` | bool | Signal profile/tier/level per operating point | +| `color_info_present` | bool | Signal color info per operating point | +| `mlayer_info_idc` | int | Mlayer info mode (0=none, 1=same, 2=explicit) | +| `operating_points` | array | Array of operating point definitions | + +Each operating point entry: + +| Field | Type | Description | +|-------|------|-------------| +| `intent` | int | Display intent | +| `xlayer_map` | array | List of xlayer IDs included in this OP | +| `mlayer_count` | array | Number of embedded layers per xlayer in this OP (0=all) | + +### Atlas + +Atlas signaling describes how xlayers are spatially composed into a +single canvas. 
+ +```json +"atlas": { + "enable": true, + "mode": 0, + "width": 1920, + "height": 1080, + "uniform_spacing": false +} +``` + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `enable` | bool | `false` | Write Atlas OBU | +| `mode` | int | 0 | Atlas mode | +| `width` | int | 0 | Canvas width (0 = derive from xlayers) | +| `height` | int | 0 | Canvas height (0 = derive) | +| `uniform_spacing` | bool | `true` | Auto-grid (`true`) or explicit positions (`false`) | + +When `uniform_spacing` is `false`, each xlayer must specify `atlas_pos_x` +and `atlas_pos_y`. + +### Input Sources + +Define multiple named input sequences. Each xlayer references one by +name and specifies crop coordinates within that input. The same input +can feed multiple xlayers with different crop regions. + +```json +"inputs": [ + { "name": "texture", "filename": "video.yuv", "width": 1920, "height": 1080 }, + { "name": "alpha", "filename": "alpha.yuv", "width": 1920, "height": 1080, + "format": "yuv420", "bit_depth": 8 } +], +"xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, ... }, + { "xlayer_id": 1, "input_source": "texture", "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0, ... }, + { "xlayer_id": 2, "input_source": "alpha", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, "layer_type": "auxiliary", + "auxiliary_type": "alpha", ... 
} +] +``` + +Each input source entry: + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `name` | string | *required* | Unique name to reference from xlayers | +| `filename` | string | *required* | Input file path (Y4M or raw YUV) | +| `width` | int | 0 | Frame width (required for raw YUV, 0 = auto-detect from Y4M) | +| `height` | int | 0 | Frame height | +| `format` | string | `""` | Chroma format: `"yuv420"`, `"yuv422"`, `"yuv444"` (auto-detected for Y4M) | +| `bit_depth` | int | 0 | Input bit depth (0 = auto-detect from Y4M, or default 8 for raw) | +| `frame_rate` | number or string | 0 | Frame rate as a number (e.g. `30`, `29.97`) or rational string (e.g. `"30000/1001"`). 0 = auto-detect from Y4M or use global timebase. | + +**Input resolution priority** (per xlayer, in order): +1. `"input"` — xlayer reads its own file, no shared source +2. `"input_source"` — references a named input from `"inputs"`, uses + `atlas_pos_x/y` as crop origin within that input +3. If neither is set and `"inputs"` has exactly 1 entry, all xlayers + use that input +4. If neither is set and `"inputs"` has multiple entries, validation error + +When an xlayer uses an input source: +- `atlas_pos_x` / `atlas_pos_y` are required (used as the crop origin) +- `width` / `height` are required (used as the crop size) +- All xlayers sharing the same input source must use the same chroma + format (profile) + +**Mixed mode:** Some xlayers can use `input_source` (shared sources) +while others use `input` (own files) in the same config. + +**Backward compatibility:** The old single `"source"` object is +internally converted to `"inputs"` with a single entry named +`"default"`: + +```json +{ "source": { "filename": "v.yuv", "width": 1920, "height": 1080 } } +``` + +is equivalent to: + +```json +{ "inputs": [{ "name": "default", "filename": "v.yuv", "width": 1920, "height": 1080 }] } +``` + +`"inputs"` and `"source"` cannot both be present. 
+ +### Codec Controls + +Generic codec controls allow per-xlayer override of encoder settings +that are normally only accessible via CLI flags. Controls are applied +after encoder initialization via `avm_codec_control()`. + +```json +{ + "xlayer_id": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_restoration", 0], + ["enable_tpl_model", 0], + ["enable_keyframe_filtering", 0], + ["enable_global_motion", 0], + ["enable_warped_motion", 0] + ] +} +``` + +Each entry is a `[name, value]` pair where `name` is a string matching +the codec control name and `value` is an integer. Supported control +names map directly to the `AV2E_SET_*` codec control IDs: + +| Control Name | CLI Equivalent | Description | +|-------------|----------------|-------------| +| `enable_deblocking` | `--enable-deblocking` | Deblocking filter | +| `enable_cdef` | `--enable-cdef` | CDEF filter | +| `enable_restoration` | `--enable-restoration` | Loop restoration | +| `enable_tpl_model` | `--enable-tpl-model` | Temporal dependency model | +| `enable_keyframe_filtering` | `--enable-keyframe-filtering` | Keyframe filtering | +| `enable_global_motion` | `--enable-global-motion` | Global motion estimation | +| `enable_warped_motion` | `--enable-warped-motion` | Warped motion compensation | +| `enable_intrabc` | `--enable-intrabc` | Intra block copy | +| `enable_palette` | `--enable-palette` | Palette mode | +| `enable_interintra_comp` | `--enable-interintra-comp` | Inter-intra compound | + +This is particularly useful for creating fast debug configurations +that disable expensive coding tools. See the `*_fast.json` configs +in `cfg/xlayer/` for examples. + +### GOP Modes + +The `gop_mode` field controls the Group of Pictures structure, which +determines how keyframes and reference frames are managed across GOP +boundaries. 
Three modes are available: + +#### `"closed"` (default) + +Closed GOP: each GOP begins with a Closed Loop Key (CLK) frame that +resets all reference buffers. No inter-prediction is possible across +GOP boundaries. Works with both monotonic and non-monotonic output. + +```json +{ "gop_mode": "closed" } +``` + +Derived settings: `fwd_kf_enabled = 0`, `enable_keyframe_filtering = 0`, +`add_sef_for_hidden_frames = 0`. + +#### `"open_leading"` (non-monotonic only) + +Open GOP with Open Loop Key (OLK) and leading pictures. The forward +keyframe is coded as a KEY_FRAME (OLK OBU) at the GOP boundary. An OLK +can be either **displayed** (implicit output — the decoder reorders it +to the correct display position) or **hidden** (followed by an overlay +or SEF in the same temporal unit). By default in this mode, the OLK is +displayed; setting `enable_keyframe_filtering` to 2 makes it hidden +with a filtered overlay. Frames before the OLK in display order but +after it in coding order are "leading pictures" (LEADING_TILE_GROUP +OBUs). + +This mode requires `lag_in_frames > 0` (for the lookahead needed to +code the forward keyframe) and is incompatible with +`monotonic_output_order: true` (leading OBUs require non-monotonic output). + +```json +{ "gop_mode": "open_leading", "lag_in_frames": 19, "kf_max_dist": 9 } +``` + +**Important constraints:** +- OLK OBUs cannot be in the same temporal unit as leading OBUs. TUs + with leading OBUs contain only leading VCL OBUs. +- The OLK designation is at the **temporal unit level**, not the frame + level. Higher embedded layers in an OLK TU can be inter OBUs. +- `enable_keyframe_filtering` is independent of GOP mode and defaults + to 0. When set to 2, the OLK is hidden and a filtered overlay frame + is produced in the same TU. When 0, the OLK is displayed directly. + +Derived settings: `fwd_kf_enabled = 1`, `enable_keyframe_filtering = 0`, +`add_sef_for_hidden_frames = 0`. 
+ +#### `"open_sef"` (monotonic compatible) + +Open GOP with hidden intra frame and SEF output. When combined with +`monotonic_output_order: true`, the forward keyframe is coded as a +hidden INTRA_ONLY_FRAME instead of KEY_FRAME. This preserves reference +buffers across the GOP boundary (no reset), enabling inter-prediction +from frames before the boundary. The hidden intra frame is later shown +via the Show Existing Frame (SEF) mechanism. + +This mode requires `lag_in_frames > 0` for the lookahead. When +monotonic output is enabled, the `intra_only_fwd_kf` control is +automatically set. + +```json +{ + "gop_mode": "open_sef", + "lag_in_frames": 19, + "kf_max_dist": 9 +} +``` + +Derived settings: `fwd_kf_enabled = 1`, `enable_keyframe_filtering = 0`, +`add_sef_for_hidden_frames = 1`. When `monotonic_output_order` is also +`true`: `intra_only_fwd_kf = 1`. + +**INTRA_ONLY_FRAME vs KEY_FRAME:** An INTRA_ONLY_FRAME is intra-coded +(no inter-prediction within the frame) but does NOT reset reference +buffers, frame number, or reference frame mappings. Subsequent frames +can still reference frames from before the GOP boundary. A KEY_FRAME, +in contrast, resets all reference state, creating a clean random access +point. + +#### Multi-Mlayer Keyframe Management + +For xlayers with multiple embedded layers (mlayers): + +- **With `lag_in_frames = 0`:** The encoder-internal keyframe placement + is disabled (`kf_mode = AVM_KF_DISABLED`) because the encoder's + keyframe counter advances per encode call, not per temporal unit. The + xlayer encode loop manages keyframes externally via `AVM_EFLAG_FORCE_KF` + on independent mlayers (those with `depends_on: []`, or mlayer 0 when + `depends_on` is absent). + +- **With `lag_in_frames > 0`:** The encoder uses `multi_layers_lag_test` + which fixes the per-encode-call keyframe counter and enables internal + forward keyframe support for multi-mlayer encoding. 
This is required + for `gop_mode: "open_leading"` and `gop_mode: "open_sef"` with + multiple embedded layers. + +**Multi-rate encoding:** Input sources can have different frame rates. +The encoder uses the highest frame rate as the master rate and encodes +at that cadence. Lower-rate sources must have frame rates that are exact +integer divisors of the master rate. On temporal units where a source is +not active, its xlayers are skipped. + +Frame rates are stored internally as rational numbers (`num/den`) to +avoid floating-point precision issues. The JSON accepts both numeric +values (e.g. `30`, `29.97`) and rational strings (e.g. `"30000/1001"`). +Common conversions: + +| JSON value | Internal `num/den` | +|------------|-------------------| +| `60` | 60/1 | +| `30` | 30/1 | +| `29.97` | 30000/1001 | +| `23.976` | 24000/1001 | +| `"30000/1001"` | 30000/1001 | + +Example with 60 fps texture and 15 fps depth (depth encodes every 4th TU): + +```json +{ + "inputs": [ + { "name": "texture", "filename": "video.yuv", "width": 1920, "height": 1080, + "frame_rate": 60 }, + { "name": "depth", "filename": "depth.yuv", "width": 1920, "height": 1080, + "frame_rate": 15 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "input_source": "depth", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5 } + ], + "output": "multi_rate.obu" +} +``` + +--- + +### Texture + Depth + +Encode a texture layer and a depth map as two independent xlayers: + +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "layer_type": "texture", "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5 } 
+ ], + "global_lcr": { "enable": true, "purpose_id": 0, "doh_constraint": true }, + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0] }, + { "intent": 1, "xlayer_map": [0, 1] } + ] + }], + "output": "texture_depth.obu" +} +``` + +See: `cfg/xlayer/texture_depth_2layer.json` + +### Stereo Video + +Encode left and right views as separate xlayers (simulcast). Each +view is encoded independently — there is no inter-layer prediction +between views. For stereo with inter-layer prediction, see +[Stereo via Embedded Layers](#stereo-via-embedded-layers). + +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "left.y4m", "width": 1920, "height": 1080, + "layer_type": "stereo", "view_type": "left", "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "input": "right.y4m", "width": 1920, "height": 1080, + "layer_type": "stereo", "view_type": "right", "qp": 128, "cpu_used": 5 } + ], + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0] }, + { "intent": 1, "xlayer_map": [0, 1] } + ] + }], + "output": "stereo.obu" +} +``` + +See: `cfg/xlayer/stereo_2layer.json` + +### Subpicture Tiling + +Tile a 1920x1080 frame into 4 quadrants from a single input source: + +```json +{ + "inputs": [ + { "name": "default", "filename": "video.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0, "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 2, "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 540, "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 3, "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 540, "qp": 128, "cpu_used": 5 } + ], + "atlas": { "enable": true, "width": 1920, "height": 
1080, + "uniform_spacing": false }, + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0] }, + { "intent": 1, "xlayer_map": [0, 1, 2, 3] } + ] + }], + "output": "subpicture_4q.obu" +} +``` + +See: `cfg/xlayer/subpicture_4quadrant.json` + +### Subpicture with Auxiliary Layers + +Encode texture and alpha from separate source files, each tiled into +subpictures. The texture tiles crop from the texture source, and the +alpha tiles crop from the alpha source: + +```json +{ + "inputs": [ + { "name": "texture", "filename": "video.yuv", "width": 1920, "height": 1080 }, + { "name": "alpha", "filename": "alpha.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, "layer_type": "texture", ... }, + { "xlayer_id": 1, "input_source": "texture", "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0, "layer_type": "texture", ... }, + { "xlayer_id": 2, "input_source": "alpha", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "auxiliary", "auxiliary_type": "alpha", ... }, + { "xlayer_id": 3, "input_source": "alpha", "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0, + "layer_type": "auxiliary", "auxiliary_type": "alpha", ... 
} + ], + "output": "subpic_tex_alpha.obu" +} +``` + +See: `cfg/xlayer/subpicture_texture_alpha_4q.json` + +### Spatial Scalability with Embedded Layers + +Encode a texture layer with 3 spatial scales (1/4, 1/2, full) and a +depth layer at full resolution only: + +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "scaling_mode": ["1/4", "1/2", "1:1"], + "layer_type": "texture", "qp": 128, "cpu_used": 9 }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 1, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 9 } + ], + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1] }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [3] }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [3, 1] } + ] + }], + "output": "scalable_texture_depth.obu" +} +``` + +This produces three operating points: +- OP0: texture at 1/4 resolution (480x270) — lowest bandwidth +- OP1: texture at all 3 scales (480x270, 960x540, 1920x1080) — full quality +- OP2: texture at all scales + depth — complete bitstream + +See: `cfg/xlayer/texture_depth_2layer_3ml.json` + +### Mixed Embedded Layer Counts + +Different xlayers can have different numbers of embedded layers. For +example, a main texture layer could use 3 embedded layers for spatial +scalability while an auxiliary depth layer uses only 1. The constraint +is that output frames within a TU must have matching order hints and +synchronized random access points — NOT that embedded layer counts +match across xlayers. + +### Stereo via Embedded Layers + +Encode left and right views as two embedded layers within a single +xlayer, each reading from a different input source. 
This allows +inter-layer prediction between views when `depends_on` is set: + +```json +{ + "inputs": [ + { "name": "left", "filename": "left.yuv", "width": 1920, "height": 1080 }, + { "name": "right", "filename": "right.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "left", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "stereo", "view_type": "left", + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1:1", "input_source": "left", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "right", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [0] } + ], + "qp": 128, "cpu_used": 5 } + ], + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "mlayer_info_idc": 2, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1] }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [2] } + ] + }], + "output": "stereo_embedded.obu" +} +``` + +This produces two operating points: +- OP0: left view only (mlayer 0) +- OP1: both views (mlayers 0 and 1) + +See: `cfg/xlayer/stereo_embedded_2ml.json` + +### Subpicture Tiling via Embedded Layers + +Tile a 1920x1080 frame into 4 quadrants using 4 embedded layers within +a single xlayer, each cropping from a different region of the same +input source. 
This avoids needing 4 separate xlayers: + +```json +{ + "inputs": [ + { "name": "video", "filename": "video.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "video", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "num_embedded_layers": 4, + "embedded_layers": [ + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 960, "height": 540, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 960, "atlas_pos_y": 0, "width": 960, "height": 540, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 0, "atlas_pos_y": 540, "width": 960, "height": 540, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "video", + "atlas_pos_x": 960, "atlas_pos_y": 540, "width": 960, "height": 540, + "depends_on": [] } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "subpic_embedded.obu" +} +``` + +Note `depends_on: []` on each mlayer — the quadrants are spatially +independent so inter-layer prediction is disabled. + +See: `cfg/xlayer/subpicture_embedded_4q.json` + +### Texture + Depth via Embedded Layers with XLayers + +Combine xlayers and per-mlayer embedded layers. 
One xlayer uses 3 +embedded layers for spatial scalability (1/4, 1/2, full), while a +second xlayer has 2 embedded layers reading from texture and depth +sources separately: + +```json +{ + "inputs": [ + { "name": "texture", "filename": "texture.yuv", "width": 1920, "height": 1080 }, + { "name": "depth", "filename": "depth.yuv", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "texture", + "num_embedded_layers": 3, + "embedded_layers": [ + { "scaling_mode": "1/4" }, + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 }, + { "xlayer_id": 1, "input_source": "texture", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "input_source": "texture", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "depth", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] } + ], + "qp": 160, "cpu_used": 5 } + ], + "ops": [{ + "ops_id": 0, "priority": 0, "intent_present": true, "ptl_present": true, + "mlayer_info_idc": 2, + "operating_points": [ + { "intent": 0, "xlayer_map": [0], "mlayer_count": [1] }, + { "intent": 1, "xlayer_map": [0], "mlayer_count": [3] }, + { "intent": 2, "xlayer_map": [0, 1], "mlayer_count": [3, 2] } + ] + }], + "output": "texture_depth_embedded.obu" +} +``` + +This produces three operating points: +- OP0: texture at 1/4 resolution +- OP1: texture at all 3 scales +- OP2: texture at all scales + depth via independent embedded layers + +See: `cfg/xlayer/texture_depth_embedded_3ml_2ml.json` + +--- + +## GOP Mode and Output Order + +The `gop_mode` and `monotonic_output_order` settings interact to control +how keyframes, reference 
frames, and hidden frames are managed. This +section covers the valid combinations with multi-xlayer + multi-mlayer +examples. + +### Compatibility Matrix + +| GOP Mode | Non-Monotonic (`false`) | Monotonic (`true`) | +|----------|:-----------------------:|:------------------:| +| `closed` | Yes | Yes | +| `open_leading` | Yes | **No** | +| `open_sef` | Yes* | Yes | + +\* `open_sef` with non-monotonic is valid but uses KEY_FRAME (not +INTRA_ONLY_FRAME) as the forward keyframe, which resets reference +buffers. With monotonic output, `open_sef` uses INTRA_ONLY_FRAME to +preserve references across the GOP boundary. + +**Key differences:** + +- **Non-monotonic**: ARF and INTNL_ARF frames are implicit output + frames (the decoder reorders them to display order). No SEF OBUs are + needed for these. Zero overhead. +- **Monotonic**: ARF and INTNL_ARF frames are genuinely hidden. SEF + OBUs are inserted at the correct monotonic position to display them. + SEFs have zero coding cost. +- **`open_leading`**: The forward keyframe is an OLK. By default it is + **displayed** (implicit output), but it can be made hidden with + `enable_keyframe_filtering = 2` (producing a filtered overlay in the + same TU). The OLK designation is at the TU level; higher embedded + layers in the OLK TU can be inter OBUs. +- **`open_sef` + monotonic**: The forward keyframe is a **hidden + INTRA_ONLY_FRAME** that does NOT reset reference buffers. + Inter-prediction across the GOP boundary is possible. + +### Closed GOP, Non-Monotonic (Multi-XLayer + Multi-MLayer) + +Each GOP begins with a CLK that resets all reference buffers. ARF and +INTNL_ARF frames are implicit output (decoder reorders). This is the +simplest and most robust configuration. 
+ +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 2, "scaling_mode": ["1/2", "1:1"], + "qp": 128, "cpu_used": 5, "lag_in_frames": 19, + "gop_mode": "closed" }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 1, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5, "lag_in_frames": 19, + "gop_mode": "closed" } + ], + "monotonic_output_order": false, + "output": "closed_nonmono.obu" +} +``` + +See: `cfg/xlayer/texture_depth_2xl_2ml_closed_nonmono.json` + +### Closed GOP, Monotonic (Multi-XLayer + Multi-MLayer) + +Same as above but with monotonic output. Hidden frames (ARF, INTNL_ARF) +are output via SEF at the correct display position. This is required +when the application needs frames in strict display order (e.g., +low-delay playback without reordering). + +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 2, "scaling_mode": ["1/2", "1:1"], + "qp": 128, "cpu_used": 5, "lag_in_frames": 19, + "gop_mode": "closed" }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 1, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5, "lag_in_frames": 19, + "gop_mode": "closed" } + ], + "monotonic_output_order": true, + "output": "closed_mono.obu" +} +``` + +See: `cfg/xlayer/texture_depth_2xl_2ml_closed_mono.json` + +### Open Leading, Non-Monotonic (Multi-XLayer + Multi-MLayer) + +The forward keyframe is an OLK at each GOP boundary. By default it is +displayed (implicit output), but `enable_keyframe_filtering` can make +it hidden with a filtered overlay. Frames before the OLK in display +order are coded as leading pictures after the OLK in coding order. The +OLK allows random access while preserving some coding efficiency +through leading-picture prediction. 
+ +Requires `lag_in_frames > 0` and `monotonic_output_order: false`. + +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 2, "scaling_mode": ["1/2", "1:1"], + "qp": 128, "cpu_used": 5, + "lag_in_frames": 19, "kf_max_dist": 9, + "gop_mode": "open_leading" }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 1, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5, + "lag_in_frames": 19, "kf_max_dist": 9, + "gop_mode": "open_leading" } + ], + "monotonic_output_order": false, + "output": "open_leading_nonmono.obu" +} +``` + +See: `cfg/xlayer/texture_depth_2xl_2ml_open_leading.json` + +### Open SEF, Monotonic (Multi-XLayer + Multi-MLayer) + +The forward keyframe is a hidden INTRA_ONLY_FRAME that does not reset +reference buffers. Inter-prediction from frames before the GOP boundary +is preserved. The hidden frame is output via SEF in monotonic display +order. This gives the best coding efficiency at GOP boundaries while +maintaining strict display-order output. + +Requires `lag_in_frames > 0`. 
+ +```json +{ + "xlayers": [ + { "xlayer_id": 0, "input": "texture.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 2, "scaling_mode": ["1/2", "1:1"], + "qp": 128, "cpu_used": 5, + "lag_in_frames": 19, "kf_max_dist": 9, + "gop_mode": "open_sef" }, + { "xlayer_id": 1, "input": "depth.y4m", "width": 1920, "height": 1080, + "num_embedded_layers": 1, + "layer_type": "auxiliary", "auxiliary_type": "depth", + "qp": 160, "cpu_used": 5, + "lag_in_frames": 19, "kf_max_dist": 9, + "gop_mode": "open_sef" } + ], + "monotonic_output_order": true, + "output": "open_sef_mono.obu" +} +``` + +See: `cfg/xlayer/texture_depth_2xl_2ml_open_sef_mono.json` + +--- + +## Decoding + +### Basic multi-layer decode + +```bash +# Decode all layers into a single interleaved y4m +avmdec --all-layers -o decoded.y4m combined.obu + +# Decode all layers into separate per-stream files +avmdec --all-layers --num-streams=2 -o decoded_%d.y4m combined.obu +``` + +### Atlas composite decode + +Reconstruct the original composite canvas from subpicture tiles: + +```bash +avmdec --all-layers --atlas-composite \ + --xlayer-config subpicture_4quadrant.json \ + -o composite.y4m subpicture_4q.obu +``` + +This reads the atlas layout from the JSON config and composites each +decoded xlayer back into its position on the canvas. + +--- + +## Stream Demuxing + +The `stream_demuxer` tool (built alongside `avmenc` and `avmdec`) can +extract individual xlayer bitstreams from a combined multi-xlayer OBU +file: + +```bash +stream_demuxer input.obu output_prefix +``` + +This produces separate `.obu` files for each xlayer discovered in the +Global LCR: `output_prefix_0.obu`, `output_prefix_1.obu`, etc. Each +extracted stream can be decoded independently with the standard decoder. + +--- + +## Constraints and Validation + +The JSON config is validated before encoding. The following constraints +are enforced: + +1. **xlayer_id** must be unique and in range 0-30. +2. 
Each xlayer must have an `input` file or an `input_source` reference, + or a single default `inputs` entry must be configured. +3. **Input source names** must be unique and non-empty. +4. `"inputs"` and `"source"` cannot both be present. +5. When multiple `inputs` are defined, each xlayer without its own + `input` file must have an explicit `input_source`. +6. **num_embedded_layers** must be 1-8. +7. When `num_embedded_layers > 1`: + - The last entry in `scaling_mode` must be `"1:1"` (full resolution). + - All scaling mode values must be valid (0-6). +8. **Input source** mode requires `atlas_pos_x`, `atlas_pos_y`, + `width`, and `height` for every xlayer using that source. Xlayers + sharing the same input source must use the same chroma format. +9. **OPS** operating points may only reference xlayer IDs that exist + in the config. +10. When **`monotonic_output_order` is `false`**, all xlayers must use + the same coding structure: `num_temporal_layers`, `lag_in_frames`, + `kf_max_dist`, `subgop_config`, and `gop_mode`. Different + `num_embedded_layers` is allowed. +11. **`gop_mode: "open_leading"`** is not allowed when + `monotonic_output_order` is `true` (leading OBUs require + non-monotonic output). +12. **Input source frame rates** must be exact integer divisors of the + highest frame rate among all input sources (e.g. 60/30/15 is valid, + but 30/24 is not). +13. **`embedded_layers`** and the flat `scaling_mode` array are mutually + exclusive on the same xlayer entry. +14. **`embedded_layers`** array length must match `num_embedded_layers`. +15. Per-mlayer **`input_source`** requires `width`, `height`, + `atlas_pos_x`, and `atlas_pos_y`. +16. **`depends_on`** entries must reference mlayer indices strictly less + than the current mlayer index. mlayer 0 cannot depend on anything. +17. 
**CLK/OLK alignment:** When a CLK (Closed Layer Key) OBU appears + in a temporal unit, the first embedded layer (mlayer 0) and all + independent embedded layers (those with `depends_on: []`) must + also have CLK OBUs. The same rule applies to OLK (Open Layer Key) + OBUs. The encoder enforces this automatically. +18. **Monotonic output order and hidden frames:** When + `monotonic_output_order` is `true`, implicit output frames are not + allowed. All hidden frames (ARFs, forward keyframes) must be output + via SEF (Show Existing Frame) instead. The encoder automatically + enables `add_sef_for_hidden_frames` when monotonic output is + requested. This precludes `gop_mode: "open_leading"` (which uses + implicit output for OLK overlays and leading frames). +19. **Open GOP with monotonic output:** When `gop_mode: "open_sef"` and + `monotonic_output_order` is `true`, the forward keyframe is coded as + INTRA_ONLY_FRAME (not KEY_FRAME). This preserves reference buffers + across the GOP boundary, enabling inter-prediction from pre-boundary + frames. The hidden intra frame is later shown via SEF. 
+ +--- + +## Reference Configs + +The `cfg/xlayer/` directory contains ready-to-use configuration files: + +| Config | Description | +|--------|-------------| +| `texture_depth_2layer.json` | Texture + depth, 2 xlayers | +| `texture_depth_2layer_3ml.json` | Texture (3 embedded layers) + depth | +| `texture_depth_2layer_clk.json` | Texture + depth, closed GOP | +| `texture_depth_2layer_open_leading.json` | Texture + depth, open leading GOP (1 mlayer each) | +| `texture_depth_2layer_open_sef.json` | Texture + depth, open SEF GOP (1 mlayer each) | +| `texture_depth_2xl_2ml_closed_nonmono.json` | **2 xlayers × 2 mlayers, closed GOP, non-monotonic** | +| `texture_depth_2xl_2ml_closed_mono.json` | **2 xlayers × 2 mlayers, closed GOP, monotonic** | +| `texture_depth_2xl_2ml_open_leading.json` | **2 xlayers × 2 mlayers, open leading, non-monotonic** | +| `texture_depth_2xl_2ml_open_sef_mono.json` | **2 xlayers × 2 mlayers, open SEF, monotonic** | +| `texture_depth_2layer_local_only.json` | Texture + depth, local-only LCR | +| `texture_depth_2layer_fast.json` | Texture + depth, fast debug settings (coding tools disabled) | +| `texture_2mlayer_fast.json` | Single xlayer with 2 embedded layers, fast debug settings | +| `texture_alpha_depth_3layer.json` | Texture + alpha + depth, 3 xlayers | +| `stereo_2layer.json` | Stereo simulcast: left + right as separate xlayers (no inter-layer prediction) | +| `subpicture_3region.json` | 3-region subpicture tiling | +| `subpicture_4quadrant.json` | 4-quadrant subpicture tiling (single input source) | +| `subpicture_texture_alpha_4q.json` | 4-quadrant with separate texture + alpha input sources | +| `annexG2_360degree_9xlayer.json` | 360-degree video, 9 subpictures with 3 embedded layers each | +| `annexG3_videoconf_3xlayer.json` | Video conferencing, 3 participants | +| `annexG4_roi_scalable_2xlayer.json` | ROI scalable, base + enhancement | +| `stereo_embedded_2ml.json` | Stereo views via 2 embedded layers with inter-layer prediction | 
+| `subpicture_embedded_4q.json` | 4-quadrant subpicture via 4 embedded layers | +| `texture_depth_embedded_3ml_2ml.json` | Texture (3 mlayers) + depth via embedded layers with xlayers | diff --git a/test/test.cmake b/test/test.cmake index 49c61873c7..5be36c8f1d 100644 --- a/test/test.cmake +++ b/test/test.cmake @@ -45,7 +45,11 @@ list( "${AVM_ROOT}/test/test_vectors.h" "${AVM_ROOT}/test/transform_test_base.h" "${AVM_ROOT}/test/util.h" - "${AVM_ROOT}/test/video_source.h") + "${AVM_ROOT}/test/video_source.h" + "${AVM_ROOT}/test/xlayer_config_test.cc" + "${AVM_ROOT}/test/tu_assembler_test.cc" + "${AVM_ROOT}/common/xlayer_config_parse.c" + "${AVM_ROOT}/common/tu_assembler.c") list( APPEND @@ -358,6 +362,7 @@ function(setup_avm_test_targets) endif() endif() + target_sources(test_libavm PRIVATE $) target_link_libraries(test_libavm ${AVM_LIB_LINK_TYPE} avm avm_gtest) if(CONFIG_LIBYUV) diff --git a/test/tu_assembler_test.cc b/test/tu_assembler_test.cc new file mode 100644 index 0000000000..21a81d4673 --- /dev/null +++ b/test/tu_assembler_test.cc @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. 
+ */ + +#include + +#include + +#include "avm/avm_integer.h" +#include "av2/common/enums.h" +#include "common/tu_assembler.h" +#include "common/xlayer_config.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +// Helper: build a minimal MultiXLayerConfig for testing +static void MakeMinimalConfig(MultiXLayerConfig *cfg, int num_xlayers, + const int *xlayer_ids) { + xlayer_config_init(cfg); + cfg->num_xlayers = num_xlayers; + for (int i = 0; i < num_xlayers; i++) { + cfg->xlayers[i].xlayer_id = xlayer_ids[i]; + snprintf(cfg->xlayers[i].input_filename, PATH_MAX, "input_%d", i); + cfg->xlayers[i].width = 1920; + cfg->xlayers[i].height = 1080; + } + cfg->enable_global_lcr = 1; + snprintf(cfg->output_filename, PATH_MAX, "test_out.obu"); +} + +// Helper: parse an OBU at the given offset, return header info. +// Returns the total consumed bytes (length_field + obu_total_size). +static int ParseObuAt(const uint8_t *buf, size_t buf_size, size_t offset, + int *out_type, int *out_ext_flag, int *out_xlayer_id, + size_t *out_payload_size) { + if (offset >= buf_size) return -1; + + uint64_t obu_total_size = 0; + size_t length_field_size = 0; + if (avm_uleb_decode(buf + offset, buf_size - offset, &obu_total_size, + &length_field_size) != 0) { + return -1; + } + + if (obu_total_size == 0) return -1; + + const uint8_t *hdr = buf + offset + length_field_size; + *out_ext_flag = (hdr[0] >> 7) & 1; + *out_type = (hdr[0] >> 2) & 0x1F; + + int hdr_size = 1; + *out_xlayer_id = 0; + if (*out_ext_flag) { + *out_xlayer_id = hdr[1] & 0x1F; + hdr_size = 2; + } + + *out_payload_size = (size_t)obu_total_size - hdr_size; + + return (int)(length_field_size + (size_t)obu_total_size); +} + +// --- Init / Free Tests --- + +TEST(TUAssembler, InitAndFree) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + EXPECT_NE(ta.buffer, nullptr); + EXPECT_EQ(ta.size, 0u); 
+ EXPECT_GE(ta.capacity, (size_t)TU_ASM_INITIAL_CAPACITY); + EXPECT_EQ(ta.num_xlayers, 2); + EXPECT_EQ(ta.xlayer_ids[0], 0); + EXPECT_EQ(ta.xlayer_ids[1], 1); + + tu_assembler_free(&ta); + EXPECT_EQ(ta.buffer, nullptr); + EXPECT_EQ(ta.size, 0u); +} + +// --- TD Write Test --- + +TEST(TUAssembler, WriteTD) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_td(&ta), 0); + + // TD should be 2 bytes: [size=1][header_byte=0x08] + ASSERT_EQ(ta.size, 2u); + EXPECT_EQ(ta.buffer[0], 1); // ULEB128 size = 1 + EXPECT_EQ(ta.buffer[1], 0x08); // OBU_TEMPORAL_DELIMITER << 2 + + // Parse it back + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + EXPECT_EQ(consumed, 2); + EXPECT_EQ(type, OBU_TEMPORAL_DELIMITER); + EXPECT_EQ(ext_flag, 0); + EXPECT_EQ(payload_size, 0u); + + tu_assembler_free(&ta); +} + +// --- OBU Header Rewriting Test --- + +TEST(TUAssembler, AppendXLayerObusRewritesHeaders) { + int ids[] = { 0, 3 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + + // Construct a fake per-xlayer OBU: a Sequence Header with no extension. + // Format: [uleb128 size][header_byte][payload...] 
+ // OBU_SEQUENCE_HEADER = 1, header byte: ext=0, type=1, tlayer=0 => (1<<2)=4 + const uint8_t fake_payload[] = { 0xAA, 0xBB, 0xCC }; + uint8_t input_obu[16]; + size_t input_size = 0; + + // ULEB128 size = 1 (header) + 3 (payload) = 4 + uint8_t size_buf[4]; + size_t size_len = 0; + avm_uleb_encode(4, sizeof(size_buf), size_buf, &size_len); + memcpy(input_obu + input_size, size_buf, size_len); + input_size += size_len; + + // Header byte: ext=0, type=OBU_SEQUENCE_HEADER(1), tlayer=0 + input_obu[input_size++] = (uint8_t)((1 << 2)); // 0x04 + + // Payload + memcpy(input_obu + input_size, fake_payload, sizeof(fake_payload)); + input_size += sizeof(fake_payload); + + // Append with xlayer_id = 3 + ASSERT_EQ(tu_assembler_append_xlayer_obus(&ta, 3, input_obu, input_size), 0); + + // Parse the rewritten OBU + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_SEQUENCE_HEADER); + EXPECT_EQ(ext_flag, 1); // Should now have extension + EXPECT_EQ(xlayer_id, 3); // Rewritten xlayer_id + EXPECT_EQ(payload_size, 3u); // Original payload preserved + + // Verify payload content + size_t hdr_offset = 0; + uint64_t obu_total = 0; + size_t lfs = 0; + avm_uleb_decode(ta.buffer, ta.size, &obu_total, &lfs); + hdr_offset = lfs + 2; // Skip size field and 2-byte header + EXPECT_EQ(ta.buffer[hdr_offset], 0xAA); + EXPECT_EQ(ta.buffer[hdr_offset + 1], 0xBB); + EXPECT_EQ(ta.buffer[hdr_offset + 2], 0xCC); + + tu_assembler_free(&ta); +} + +TEST(TUAssembler, AppendXLayerObusSkipsTD) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + + // Construct a TD OBU (should be skipped by append) + // OBU_TEMPORAL_DELIMITER = 2, header: ext=0, type=2, tlayer=0 => 0x08 + uint8_t td_obu[] = { 1, 0x08 }; // size=1, header + + 
ASSERT_EQ(tu_assembler_append_xlayer_obus(&ta, 0, td_obu, sizeof(td_obu)), 0); + + // No output — TD should be filtered + EXPECT_EQ(ta.size, 0u); + + tu_assembler_free(&ta); +} + +TEST(TUAssembler, AppendXLayerObusPreservesExtension) { + int ids[] = { 0, 2 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + + // Construct an OBU that already has an extension byte + // OBU_MULTI_FRAME_HEADER = 3 + // Header: ext=1, type=3, tlayer=1 => (1<<7)|(3<<2)|1 = 0x8D + // Extension: mlayer=2, xlayer=0 => (2<<5)|0 = 0x40 + const uint8_t payload[] = { 0xDE, 0xAD }; + uint8_t input_obu[16]; + size_t input_size = 0; + + // ULEB128 size = 2 (header+ext) + 2 (payload) = 4 + uint8_t size_buf[4]; + size_t size_len = 0; + avm_uleb_encode(4, sizeof(size_buf), size_buf, &size_len); + memcpy(input_obu + input_size, size_buf, size_len); + input_size += size_len; + + input_obu[input_size++] = 0x8D; // Header: ext=1, type=3, tlayer=1 + input_obu[input_size++] = 0x40; // Extension: mlayer=2, xlayer=0 + memcpy(input_obu + input_size, payload, sizeof(payload)); + input_size += sizeof(payload); + + // Append with xlayer_id = 2 + ASSERT_EQ(tu_assembler_append_xlayer_obus(&ta, 2, input_obu, input_size), 0); + + // Parse rewritten OBU + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_MULTI_FRAME_HEADER); + EXPECT_EQ(ext_flag, 1); + EXPECT_EQ(xlayer_id, 2); // Rewritten to target xlayer + EXPECT_EQ(payload_size, 2u); // Payload size unchanged + + // Verify mlayer_id is preserved in extension byte + uint64_t obu_total = 0; + size_t lfs = 0; + avm_uleb_decode(ta.buffer, ta.size, &obu_total, &lfs); + uint8_t ext_byte = ta.buffer[lfs + 1]; + int mlayer_id = (ext_byte >> 5) & 0x7; + EXPECT_EQ(mlayer_id, 2); // mlayer preserved + + tu_assembler_free(&ta); +} + +// --- Multiple 
OBU Append Test --- + +TEST(TUAssembler, AppendMultipleObus) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + + // Build a packet with: TD + SH + Frame data + uint8_t packet[64]; + size_t pkt_size = 0; + + // OBU 1: TD (should be skipped) + packet[pkt_size++] = 1; // size=1 + packet[pkt_size++] = 0x08; // TD header + + // OBU 2: Sequence Header (type=1), 2 bytes payload + uint8_t sb[4]; + size_t sl = 0; + avm_uleb_encode(3, sizeof(sb), sb, &sl); // size = 1 hdr + 2 payload + memcpy(packet + pkt_size, sb, sl); + pkt_size += sl; + packet[pkt_size++] = 0x04; // SH header, no ext + packet[pkt_size++] = 0x11; // payload byte 1 + packet[pkt_size++] = 0x22; // payload byte 2 + + // OBU 3: Leading Tile Group (type=6), 3 bytes payload + avm_uleb_encode(4, sizeof(sb), sb, &sl); // size = 1 hdr + 3 payload + memcpy(packet + pkt_size, sb, sl); + pkt_size += sl; + packet[pkt_size++] = (uint8_t)(6 << 2); // Leading TG header, no ext + packet[pkt_size++] = 0x33; + packet[pkt_size++] = 0x44; + packet[pkt_size++] = 0x55; + + ASSERT_EQ(tu_assembler_append_xlayer_obus(&ta, 1, packet, pkt_size), 0); + + // Should have 2 OBUs output (TD skipped) + size_t offset = 0; + int type, ext_flag, xlayer_id; + size_t payload_size; + + // First OBU: Sequence Header + int consumed = ParseObuAt(ta.buffer, ta.size, offset, &type, &ext_flag, + &xlayer_id, &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_SEQUENCE_HEADER); + EXPECT_EQ(xlayer_id, 1); + EXPECT_EQ(payload_size, 2u); + offset += consumed; + + // Second OBU: Leading Tile Group + consumed = ParseObuAt(ta.buffer, ta.size, offset, &type, &ext_flag, + &xlayer_id, &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_LEADING_TILE_GROUP); + EXPECT_EQ(xlayer_id, 1); + EXPECT_EQ(payload_size, 3u); + offset += consumed; + + // Should have consumed all output + EXPECT_EQ(offset, ta.size); + + tu_assembler_free(&ta); +} + 
+// --- Flush Test --- + +TEST(TUAssembler, FlushWritesToFile) { + int ids[] = { 0 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 1, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_td(&ta), 0); + + std::string path_str = testing::TempDir() + "tu_asm_flush_test.obu"; + const char *path = path_str.c_str(); + FILE *f = fopen(path, "wb"); + ASSERT_NE(f, nullptr); + + size_t pre_flush_size = ta.size; + ASSERT_EQ(tu_assembler_flush(&ta, f), 0); + fclose(f); + + // Buffer should be reset + EXPECT_EQ(ta.size, 0u); + + // Verify file contents + f = fopen(path, "rb"); + ASSERT_NE(f, nullptr); + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fclose(f); + EXPECT_EQ((size_t)file_size, pre_flush_size); + + tu_assembler_free(&ta); +} + +// --- Global LCR Population Test --- + +TEST(TUAssembler, PopulateGlobalLcr) { + int ids[] = { 0, 5 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.lcr_purpose_id = 2; + cfg.lcr_dependent_xlayers_flag = 1; + cfg.lcr_doh_constraint_flag = 1; + cfg.xlayers[0].layer_type = TEXTURE_LAYER; + cfg.xlayers[1].layer_type = AUX_LAYER; + cfg.xlayers[1].auxiliary_type = LCR_DEPTH_AUX; + + GlobalLayerConfigurationRecord glcr; + populate_global_lcr_from_config(&cfg, &glcr); + + EXPECT_EQ(glcr.LcrMaxNumXLayerCount, 2); + EXPECT_EQ(glcr.LcrXLayerID[0], 0); + EXPECT_EQ(glcr.LcrXLayerID[1], 5); + + // xlayer_map should have bits 0 and 5 set + uint32_t expected_map = (1u << 0) | (1u << 5); + EXPECT_EQ((uint32_t)glcr.lcr_xlayer_map, expected_map); + + EXPECT_EQ(glcr.lcr_global_purpose_id, 2); + EXPECT_EQ(glcr.lcr_dependent_xlayers_flag, 1); + EXPECT_EQ(glcr.lcr_doh_constraint_flag, 1); + EXPECT_EQ(glcr.lcr_global_payload_present_flag, 1); + + // Per-xlayer info: xlayer 0 + EXPECT_EQ(glcr.xlayer_info[0].lcr_rep_info_present_flag, 1); + EXPECT_EQ(glcr.xlayer_info[0].rep_params.lcr_max_pic_width, 1920); + EXPECT_EQ(glcr.xlayer_info[0].rep_params.lcr_max_pic_height, 1080); + + // 
Per-xlayer info: xlayer 5 is at positional index 1 + EXPECT_EQ(glcr.xlayer_info[1].lcr_rep_info_present_flag, 1); + EXPECT_EQ(glcr.xlayer_info[1].rep_params.lcr_max_pic_width, 1920); + EXPECT_EQ(glcr.xlayer_info[1].rep_params.lcr_max_pic_height, 1080); + + // Embedded layer type info + EXPECT_EQ(glcr.xlayer_info[0].mlayer_params.lcr_layer_type[0], TEXTURE_LAYER); + EXPECT_EQ(glcr.xlayer_info[1].mlayer_params.lcr_layer_type[0], AUX_LAYER); + EXPECT_EQ(glcr.xlayer_info[1].mlayer_params.lcr_auxiliary_type[0], + LCR_DEPTH_AUX); +} + +// --- OPS Population Test --- + +TEST(TUAssembler, PopulateOps) { + OPSConfig ops_cfg; + memset(&ops_cfg, 0, sizeof(ops_cfg)); + ops_cfg.enable = 1; + ops_cfg.ops_id = 0; + ops_cfg.priority = 1; + ops_cfg.intent_present_flag = 1; + ops_cfg.ptl_present_flag = 1; + ops_cfg.num_operating_points = 2; + + // OP0: xlayer 0 only + ops_cfg.ops[0].intent = 0; + ops_cfg.ops[0].xlayer_map = (1u << 0); + + // OP1: xlayers 0 and 3 + ops_cfg.ops[1].intent = 1; + ops_cfg.ops[1].xlayer_map = (1u << 0) | (1u << 3); + + // Set up a minimal MultiXLayerConfig for derivation + int ids[] = { 0, 3 }; + MultiXLayerConfig mcfg; + MakeMinimalConfig(&mcfg, 2, ids); + mcfg.xlayers[0].width = 960; + mcfg.xlayers[0].height = 540; + mcfg.xlayers[0].level = SEQ_LEVEL_4_0; + mcfg.xlayers[1].width = 960; + mcfg.xlayers[1].height = 540; + mcfg.xlayers[1].level = SEQ_LEVEL_4_0; + + OperatingPointSet ops; + populate_ops_from_config(&ops_cfg, GLOBAL_XLAYER_ID, &mcfg, &ops); + + EXPECT_EQ(ops.valid, 1); + EXPECT_EQ(ops.ops_id, 0); + EXPECT_EQ(ops.ops_cnt, 2); + EXPECT_EQ(ops.ops_priority, 1); + EXPECT_EQ(ops.ops_intent_present_flag, 1); + + // OP0: single xlayer + EXPECT_EQ(ops.op[0].ops_intent_op, 0); + EXPECT_EQ(ops.op[0].ops_xlayer_map, 1); + EXPECT_EQ(ops.op[0].XCount, 1); + EXPECT_EQ(ops.op[0].OpsxLayerID[0], 0); + + // OP1: two xlayers + EXPECT_EQ(ops.op[1].ops_intent_op, 1); + EXPECT_EQ(ops.op[1].ops_xlayer_map, (int)((1u << 0) | (1u << 3))); + 
EXPECT_EQ(ops.op[1].XCount, 2); + EXPECT_EQ(ops.op[1].OpsxLayerID[0], 0); + EXPECT_EQ(ops.op[1].OpsxLayerID[1], 3); +} + +// --- Global LCR OBU Write Test --- + +TEST(TUAssembler, WriteGlobalLcrObu) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_global_lcr(&ta), 0); + + // Should have produced some output + EXPECT_GT(ta.size, 0u); + + // Parse the OBU header + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_LAYER_CONFIGURATION_RECORD); + EXPECT_EQ(ext_flag, 1); + EXPECT_EQ(xlayer_id, GLOBAL_XLAYER_ID); + + tu_assembler_free(&ta); +} + +// --- MSDO OBU Write Test --- + +TEST(TUAssembler, WriteMsdoObu) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_msdo = 1; + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_msdo(&ta), 0); + + EXPECT_GT(ta.size, 0u); + + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_MULTI_STREAM_DECODER_OPERATION); + EXPECT_EQ(ext_flag, 1); + EXPECT_EQ(xlayer_id, GLOBAL_XLAYER_ID); + + tu_assembler_free(&ta); +} + +TEST(TUAssembler, MsdoSkippedWhenDisabled) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_msdo = 0; + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_msdo(&ta), 0); + + // Should produce no output when disabled + EXPECT_EQ(ta.size, 0u); + + tu_assembler_free(&ta); +} + +// --- Full TU Assembly Test --- + +TEST(TUAssembler, FullTuAssembly) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + 
MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_msdo = 1; + + // Add an OPS + cfg.num_ops_sets = 1; + cfg.ops_sets[0].enable = 1; + cfg.ops_sets[0].ops_id = 0; + cfg.ops_sets[0].intent_present_flag = 1; + cfg.ops_sets[0].ptl_present_flag = 1; + cfg.ops_sets[0].num_operating_points = 1; + cfg.ops_sets[0].ops[0].xlayer_map = 0x3; + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + + // Write TD + ASSERT_EQ(tu_assembler_write_td(&ta), 0); + + // Write structural OBUs + ASSERT_EQ(tu_assembler_write_global_lcr(&ta), 0); + ASSERT_EQ(tu_assembler_write_msdo(&ta), 0); + ASSERT_EQ(tu_assembler_write_ops(&ta, GLOBAL_XLAYER_ID), 0); + + // Fake per-xlayer data for xlayer 0 + uint8_t xl0_data[8]; + size_t xl0_size = 0; + uint8_t sb[4]; + size_t sl = 0; + avm_uleb_encode(3, sizeof(sb), sb, &sl); // SH: 1 hdr + 2 payload + memcpy(xl0_data + xl0_size, sb, sl); + xl0_size += sl; + xl0_data[xl0_size++] = 0x04; // SH header + xl0_data[xl0_size++] = 0xAA; + xl0_data[xl0_size++] = 0xBB; + + ASSERT_EQ(tu_assembler_append_xlayer_obus(&ta, 0, xl0_data, xl0_size), 0); + + // Fake per-xlayer data for xlayer 1 + uint8_t xl1_data[8]; + size_t xl1_size = 0; + avm_uleb_encode(3, sizeof(sb), sb, &sl); + memcpy(xl1_data + xl1_size, sb, sl); + xl1_size += sl; + xl1_data[xl1_size++] = 0x04; + xl1_data[xl1_size++] = 0xCC; + xl1_data[xl1_size++] = 0xDD; + + ASSERT_EQ(tu_assembler_append_xlayer_obus(&ta, 1, xl1_data, xl1_size), 0); + + // Verify total output is non-empty and can be parsed + EXPECT_GT(ta.size, 10u); + + // Walk through OBUs to verify ordering: TD, LCR, MSDO, OPS, xl0 SH, xl1 SH + size_t offset = 0; + int obu_count = 0; + int types[16] = {}; + int xlayer_ids[16] = {}; + + while (offset < ta.size && obu_count < 16) { + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, offset, &type, &ext_flag, + &xlayer_id, &payload_size); + if (consumed <= 0) break; + types[obu_count] = type; + xlayer_ids[obu_count] = xlayer_id; + 
obu_count++; + offset += consumed; + } + + // Should have at least 6 OBUs + ASSERT_GE(obu_count, 6); + + // First OBU should be TD + EXPECT_EQ(types[0], OBU_TEMPORAL_DELIMITER); + + // LCR should follow + EXPECT_EQ(types[1], OBU_LAYER_CONFIGURATION_RECORD); + EXPECT_EQ(xlayer_ids[1], GLOBAL_XLAYER_ID); + + // MSDO next + EXPECT_EQ(types[2], OBU_MULTI_STREAM_DECODER_OPERATION); + EXPECT_EQ(xlayer_ids[2], GLOBAL_XLAYER_ID); + + // OPS + EXPECT_EQ(types[3], OBU_OPERATING_POINT_SET); + EXPECT_EQ(xlayer_ids[3], GLOBAL_XLAYER_ID); + + // Per-xlayer OBUs: xlayer 0 then xlayer 1 + EXPECT_EQ(types[4], OBU_SEQUENCE_HEADER); + EXPECT_EQ(xlayer_ids[4], 0); + + EXPECT_EQ(types[5], OBU_SEQUENCE_HEADER); + EXPECT_EQ(xlayer_ids[5], 1); + + // Should have consumed all output + EXPECT_EQ(offset, ta.size); + + tu_assembler_free(&ta); +} + +// --- Atlas Population Tests --- + +TEST(TUAssembler, PopulateAtlasEnhancedUniform) { + int ids[] = { 0, 1, 2 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 3, ids); + cfg.enable_atlas = 1; + cfg.atlas_mode = ENHANCED_ATLAS; + cfg.atlas_uniform_spacing = 1; + // All xlayers same size + for (int i = 0; i < 3; i++) { + cfg.xlayers[i].width = 640; + cfg.xlayers[i].height = 480; + } + + AtlasSegmentInfo atlas; + populate_atlas_from_config(&cfg, &atlas); + + EXPECT_EQ(atlas.valid, 1); + EXPECT_EQ(atlas.atlas_segment_mode_idc, ENHANCED_ATLAS); + EXPECT_EQ(atlas.atlas_segment_id, 1); + + // Region info: 3 columns x 1 row, uniform spacing + EXPECT_EQ(atlas.ats_reg_params.ats_uniform_spacing_flag, 1); + EXPECT_EQ(atlas.ats_reg_params.ats_num_region_columns_minus_1, 2); + EXPECT_EQ(atlas.ats_reg_params.ats_num_region_rows_minus_1, 0); + EXPECT_EQ(atlas.ats_reg_params.ats_region_width_minus_1, 639); + EXPECT_EQ(atlas.ats_reg_params.ats_region_height_minus_1, 479); + EXPECT_EQ(atlas.ats_reg_params.NumRegionsInAtlas, 3); + + // Segment mapping: single_region_per_segment + EXPECT_EQ(atlas.ats_reg_seg_map.ats_single_region_per_atlas_segment_flag, 1); + 
EXPECT_EQ(atlas.ats_reg_seg_map.ats_num_atlas_segments_minus_1, 2); +} + +TEST(TUAssembler, PopulateAtlasEnhancedExplicit2x2) { + // 3 regions in a 2x2 grid (bottom-right empty) + int ids[] = { 0, 1, 2 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 3, ids); + cfg.enable_atlas = 1; + cfg.atlas_mode = ENHANCED_ATLAS; + cfg.atlas_uniform_spacing = 0; + + cfg.xlayers[0].width = 960; + cfg.xlayers[0].height = 540; + cfg.xlayers[0].atlas_pos_x = 0; + cfg.xlayers[0].atlas_pos_y = 0; + + cfg.xlayers[1].width = 960; + cfg.xlayers[1].height = 540; + cfg.xlayers[1].atlas_pos_x = 960; + cfg.xlayers[1].atlas_pos_y = 0; + + cfg.xlayers[2].width = 960; + cfg.xlayers[2].height = 540; + cfg.xlayers[2].atlas_pos_x = 0; + cfg.xlayers[2].atlas_pos_y = 540; + + AtlasSegmentInfo atlas; + populate_atlas_from_config(&cfg, &atlas); + + EXPECT_EQ(atlas.valid, 1); + EXPECT_EQ(atlas.atlas_segment_mode_idc, ENHANCED_ATLAS); + + // Grid should be 2 columns x 2 rows + EXPECT_EQ(atlas.ats_reg_params.ats_uniform_spacing_flag, 0); + EXPECT_EQ(atlas.ats_reg_params.ats_num_region_columns_minus_1, 1); + EXPECT_EQ(atlas.ats_reg_params.ats_num_region_rows_minus_1, 1); + EXPECT_EQ(atlas.ats_reg_params.ats_column_width_minus_1[0], 959); + EXPECT_EQ(atlas.ats_reg_params.ats_column_width_minus_1[1], 959); + EXPECT_EQ(atlas.ats_reg_params.ats_row_height_minus_1[0], 539); + EXPECT_EQ(atlas.ats_reg_params.ats_row_height_minus_1[1], 539); + EXPECT_EQ(atlas.ats_reg_params.NumRegionsInAtlas, 4); + + // Explicit segment mapping (not single_region_per_segment) + EXPECT_EQ(atlas.ats_reg_seg_map.ats_single_region_per_atlas_segment_flag, 0); + EXPECT_EQ(atlas.ats_reg_seg_map.ats_num_atlas_segments_minus_1, 2); + + // Segment 0 at col=0,row=0 + EXPECT_EQ(atlas.ats_reg_seg_map.ats_top_left_region_column[0], 0); + EXPECT_EQ(atlas.ats_reg_seg_map.ats_top_left_region_row[0], 0); + + // Segment 1 at col=1,row=0 + EXPECT_EQ(atlas.ats_reg_seg_map.ats_top_left_region_column[1], 1); + 
EXPECT_EQ(atlas.ats_reg_seg_map.ats_top_left_region_row[1], 0); + + // Segment 2 at col=0,row=1 + EXPECT_EQ(atlas.ats_reg_seg_map.ats_top_left_region_column[2], 0); + EXPECT_EQ(atlas.ats_reg_seg_map.ats_top_left_region_row[2], 1); +} + +TEST(TUAssembler, PopulateAtlasMultistream) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_atlas = 1; + cfg.atlas_mode = MULTISTREAM_ATLAS; + cfg.atlas_width = 1920; + cfg.atlas_height = 1080; + cfg.xlayers[0].width = 960; + cfg.xlayers[0].height = 1080; + cfg.xlayers[0].atlas_pos_x = 0; + cfg.xlayers[0].atlas_pos_y = 0; + cfg.xlayers[1].width = 960; + cfg.xlayers[1].height = 1080; + cfg.xlayers[1].atlas_pos_x = 960; + cfg.xlayers[1].atlas_pos_y = 0; + + AtlasSegmentInfo atlas; + populate_atlas_from_config(&cfg, &atlas); + + EXPECT_EQ(atlas.valid, 1); + EXPECT_EQ(atlas.atlas_segment_mode_idc, MULTISTREAM_ATLAS); + EXPECT_EQ(atlas.ats_basic_info_s.ats_stream_id_present, 1); + EXPECT_EQ(atlas.ats_basic_info_s.ats_atlas_width, 1920); + EXPECT_EQ(atlas.ats_basic_info_s.ats_atlas_height, 1080); + EXPECT_EQ(atlas.ats_basic_info_s.ats_num_atlas_segments_minus_1, 1); + + // Segment 0: xlayer_id=0, pos (0,0), 960x1080 + EXPECT_EQ(atlas.ats_basic_info_s.ats_input_stream_id[0], 0); + EXPECT_EQ(atlas.ats_basic_info_s.ats_segment_top_left_pos_x[0], 0); + EXPECT_EQ(atlas.ats_basic_info_s.ats_segment_top_left_pos_y[0], 0); + EXPECT_EQ(atlas.ats_basic_info_s.ats_segment_width[0], 960); + + // Segment 1: xlayer_id=1, pos (960,0), 960x1080 + EXPECT_EQ(atlas.ats_basic_info_s.ats_input_stream_id[1], 1); + EXPECT_EQ(atlas.ats_basic_info_s.ats_segment_top_left_pos_x[1], 960); + EXPECT_EQ(atlas.ats_basic_info_s.ats_segment_width[1], 960); +} + +// --- Atlas OBU Write Tests --- + +TEST(TUAssembler, WriteAtlasEnhancedObu) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_atlas = 1; + cfg.atlas_mode = ENHANCED_ATLAS; + cfg.atlas_uniform_spacing = 1; + 
cfg.xlayers[0].width = 960; + cfg.xlayers[0].height = 540; + cfg.xlayers[1].width = 960; + cfg.xlayers[1].height = 540; + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_atlas(&ta), 0); + + EXPECT_GT(ta.size, 0u); + + // Parse OBU header + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_ATLAS_SEGMENT); + EXPECT_EQ(ext_flag, 1); + EXPECT_EQ(xlayer_id, GLOBAL_XLAYER_ID); + + tu_assembler_free(&ta); +} + +TEST(TUAssembler, WriteAtlasMultistreamObu) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_atlas = 1; + cfg.atlas_mode = MULTISTREAM_ATLAS; + cfg.atlas_width = 1920; + cfg.atlas_height = 1080; + cfg.xlayers[0].width = 960; + cfg.xlayers[0].height = 1080; + cfg.xlayers[0].atlas_pos_x = 0; + cfg.xlayers[0].atlas_pos_y = 0; + cfg.xlayers[1].width = 960; + cfg.xlayers[1].height = 1080; + cfg.xlayers[1].atlas_pos_x = 960; + cfg.xlayers[1].atlas_pos_y = 0; + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_atlas(&ta), 0); + + EXPECT_GT(ta.size, 0u); + + int type, ext_flag, xlayer_id; + size_t payload_size; + int consumed = ParseObuAt(ta.buffer, ta.size, 0, &type, &ext_flag, &xlayer_id, + &payload_size); + ASSERT_GT(consumed, 0); + EXPECT_EQ(type, OBU_ATLAS_SEGMENT); + EXPECT_EQ(ext_flag, 1); + EXPECT_EQ(xlayer_id, GLOBAL_XLAYER_ID); + + tu_assembler_free(&ta); +} + +TEST(TUAssembler, AtlasSkippedWhenDisabled) { + int ids[] = { 0, 1 }; + MultiXLayerConfig cfg; + MakeMinimalConfig(&cfg, 2, ids); + cfg.enable_atlas = 0; + + TUAssembler ta; + ASSERT_EQ(tu_assembler_init(&ta, &cfg), 0); + ASSERT_EQ(tu_assembler_write_atlas(&ta), 0); + + EXPECT_EQ(ta.size, 0u); + + tu_assembler_free(&ta); +} + +} // namespace diff --git a/test/xlayer_config_test.cc b/test/xlayer_config_test.cc new file 
mode 100644 index 0000000000..956936534a --- /dev/null +++ b/test/xlayer_config_test.cc @@ -0,0 +1,1973 @@ +/* + * Copyright (c) 2025, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. + */ + +#include +#include + +#include + +#include "common/xlayer_config.h" +#include "common/xlayer_config_parse.h" +#include "avm/avmcx.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +// Helper: write a string to a temporary file and return the path. +// Uses a static buffer — only one temp file at a time. 
+static char temp_path[256];
+
+// Writes `json_str` to "xlayer_test.json" under testing::TempDir() and returns
+// a pointer to the static path buffer, or nullptr (after recording a test
+// failure) if the path does not fit the buffer, the file cannot be opened, or
+// the write/close fails. Every call reuses the same file and buffer, so the
+// returned path is only valid until the next call.
+const char *WriteTempJson(const char *json_str) {
+  const std::string full_path = testing::TempDir().append("xlayer_test.json");
+  const int written =
+      snprintf(temp_path, sizeof(temp_path), "%s", full_path.c_str());
+  // A truncated path would silently name a different file than intended.
+  const bool path_fits =
+      written >= 0 && static_cast<size_t>(written) < sizeof(temp_path);
+  EXPECT_TRUE(path_fits) << "Temp path too long: " << full_path;
+  if (!path_fits) return nullptr;
+
+  FILE *f = fopen(temp_path, "w");
+  EXPECT_NE(f, nullptr);
+  if (!f) return nullptr;
+
+  // fputs() returns EOF on error; fclose() flushes, so its result matters too.
+  // Always close, even if the write failed, to avoid leaking the handle.
+  const bool write_ok = fputs(json_str, f) >= 0;
+  const bool close_ok = fclose(f) == 0;
+  EXPECT_TRUE(write_ok && close_ok) << "Failed to write " << temp_path;
+  if (!write_ok || !close_ok) return nullptr;
+  return temp_path;
+}
+
+// --- Config Init Tests ---
+
+// xlayer_config_init() must set the expected top-level and per-xlayer defaults.
+TEST(XLayerConfig, InitDefaults) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+
+  EXPECT_EQ(cfg.num_xlayers, 0);
+  EXPECT_EQ(cfg.enable_global_lcr, 1);
+  EXPECT_EQ(cfg.lcr_doh_constraint_flag, 1);
+  EXPECT_EQ(cfg.combined_tu, 1);
+  EXPECT_EQ(cfg.monotonic_output_order, 1);
+  EXPECT_EQ(cfg.limit, 0);
+  EXPECT_EQ(cfg.enable_msdo, 0);
+  EXPECT_EQ(cfg.enable_atlas, 0);
+  EXPECT_EQ(cfg.num_ops_sets, 0);
+
+  // Check xlayer defaults
+  for (int i = 0; i < MAX_NUM_XLAYERS - 1; i++) {
+    EXPECT_EQ(cfg.xlayers[i].xlayer_id, -1);
+    EXPECT_EQ(cfg.xlayers[i].qp, -1);
+    EXPECT_EQ(cfg.xlayers[i].bitrate, -1);
+    EXPECT_EQ(cfg.xlayers[i].cpu_used, -1);
+    EXPECT_EQ(cfg.xlayers[i].lag_in_frames, -1);
+    EXPECT_EQ(cfg.xlayers[i].profile, (unsigned int)MAIN_420_10_IP1);
+    EXPECT_EQ(cfg.xlayers[i].level, (unsigned int)SEQ_LEVEL_4_0);
+    EXPECT_EQ(cfg.xlayers[i].num_temporal_layers, 1);
+    EXPECT_EQ(cfg.xlayers[i].num_embedded_layers, 1);
+    EXPECT_EQ(cfg.xlayers[i].view_type, VIEW_UNSPECIFIED);
+  }
+}
+
+// --- JSON Parsing Tests ---
+
+// Two xlayers with basic fields plus an output name; unspecified top-level
+// options keep their init defaults.
+TEST(XLayerConfigParse, MinimalTwoLayer) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw", "width": 1920, "height": 1080,
+        "qp": 128, "cpu_used": 5 },
+      { "xlayer_id": 1, "input": "b.raw", "width": 1920, "height": 1080,
+        "qp": 160, "cpu_used": 5 }
+    ],
+    "output": "out.obu"
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.num_xlayers, 2);
+  EXPECT_EQ(cfg.xlayers[0].xlayer_id, 0);
+  EXPECT_EQ(cfg.xlayers[1].xlayer_id, 1);
+  EXPECT_STREQ(cfg.xlayers[0].input_filename, "a.raw");
+  EXPECT_STREQ(cfg.xlayers[1].input_filename, "b.raw");
+  EXPECT_EQ(cfg.xlayers[0].width, 1920u);
+  EXPECT_EQ(cfg.xlayers[0].height, 1080u);
+  EXPECT_EQ(cfg.xlayers[0].qp, 128);
+  EXPECT_EQ(cfg.xlayers[1].qp, 160);
+  EXPECT_STREQ(cfg.output_filename, "out.obu");
+
+  // Defaults should apply
+  EXPECT_EQ(cfg.enable_global_lcr, 1);
+  EXPECT_EQ(cfg.combined_tu, 1);
+}
+
+// "layer_type", "view_type" and "auxiliary_type" strings map to their enums.
+TEST(XLayerConfigParse, LayerTypes) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw", "layer_type": "texture",
+        "view_type": "left" },
+      { "xlayer_id": 1, "input": "b.raw", "layer_type": "auxiliary",
+        "auxiliary_type": "depth" },
+      { "xlayer_id": 2, "input": "c.raw", "layer_type": "stereo",
+        "view_type": "right" }
+    ]
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.xlayers[0].layer_type, TEXTURE_LAYER);
+  EXPECT_EQ(cfg.xlayers[0].view_type, VIEW_LEFT);
+  EXPECT_EQ(cfg.xlayers[1].layer_type, AUX_LAYER);
+  EXPECT_EQ(cfg.xlayers[1].auxiliary_type, LCR_DEPTH_AUX);
+  EXPECT_EQ(cfg.xlayers[2].layer_type, STEREO_LAYER);
+  EXPECT_EQ(cfg.xlayers[2].view_type, VIEW_RIGHT);
+}
+
+// Each "auxiliary_type" string exercised here maps to its LCR_*_AUX constant.
+TEST(XLayerConfigParse, AllAuxiliaryTypes) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw", "layer_type": "auxiliary",
+        "auxiliary_type": "alpha" },
+      { "xlayer_id": 1, "input": "b.raw", "layer_type": "auxiliary",
+        "auxiliary_type": "depth" },
+      { "xlayer_id": 2, "input": "c.raw", "layer_type": "auxiliary",
+        "auxiliary_type": "segmentation" },
+      { "xlayer_id": 3, "input": "d.raw", "layer_type": "auxiliary",
+        "auxiliary_type": "gain_map" }
+    ]
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.xlayers[0].auxiliary_type, LCR_ALPHA_AUX);
+  EXPECT_EQ(cfg.xlayers[1].auxiliary_type, LCR_DEPTH_AUX);
+  EXPECT_EQ(cfg.xlayers[2].auxiliary_type, LCR_SEGMENTATION_AUX);
+  EXPECT_EQ(cfg.xlayers[3].auxiliary_type, LCR_GAIN_MAP_AUX);
+}
+
+// The "global_lcr" object populates the enable flag and the lcr_* fields.
+TEST(XLayerConfigParse, GlobalLcrSection) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw" }
+    ],
+    "global_lcr": {
+      "enable": true,
+      "purpose_id": 3,
+      "dependent_xlayers": true,
+      "doh_constraint": false
+    }
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.enable_global_lcr, 1);
+  EXPECT_EQ(cfg.lcr_purpose_id, 3);
+  EXPECT_EQ(cfg.lcr_dependent_xlayers_flag, 1);
+  EXPECT_EQ(cfg.lcr_doh_constraint_flag, 0);
+}
+
+// "msdo": { "enable": true } sets enable_msdo.
+TEST(XLayerConfigParse, MsdoSection) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw" }
+    ],
+    "msdo": { "enable": true }
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.enable_msdo, 1);
+}
+
+// An "ops" entry is parsed into ops_sets[0]; each "xlayer_map" list of ids is
+// stored as a bitmask with one bit per listed xlayer id.
+TEST(XLayerConfigParse, OpsSection) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw" },
+      { "xlayer_id": 1, "input": "b.raw" }
+    ],
+    "ops": [
+      {
+        "ops_id": 0,
+        "priority": 2,
+        "intent_present": true,
+        "ptl_present": true,
+        "operating_points": [
+          { "intent": 0, "xlayer_map": [0] },
+          { "intent": 1, "xlayer_map": [0, 1] }
+        ]
+      }
+    ]
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.num_ops_sets, 1);
+  EXPECT_EQ(cfg.ops_sets[0].ops_id, 0);
+  EXPECT_EQ(cfg.ops_sets[0].priority, 2);
+  EXPECT_EQ(cfg.ops_sets[0].intent_present_flag, 1);
+  EXPECT_EQ(cfg.ops_sets[0].ptl_present_flag, 1);
+  EXPECT_EQ(cfg.ops_sets[0].num_operating_points, 2);
+
+  // OP0: xlayer 0 only => bitmask = 0x1
+  EXPECT_EQ(cfg.ops_sets[0].ops[0].intent, 0);
+  EXPECT_EQ(cfg.ops_sets[0].ops[0].xlayer_map, 1u);
+
+  // OP1: xlayers 0 and 1 => bitmask = 0x3
+  EXPECT_EQ(cfg.ops_sets[0].ops[1].intent, 1);
+  EXPECT_EQ(cfg.ops_sets[0].ops[1].xlayer_map, 3u);
+}
+
+// Encoder overrides omitted from the JSON keep their init values after parsing.
+TEST(XLayerConfigParse, EncoderOverrideDefaults) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 5, "input": "a.raw" }
+    ]
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  // Unspecified overrides should be -1
+  EXPECT_EQ(cfg.xlayers[0].qp, -1);
+  EXPECT_EQ(cfg.xlayers[0].bitrate, -1);
+  EXPECT_EQ(cfg.xlayers[0].cpu_used, -1);
+  EXPECT_EQ(cfg.xlayers[0].lag_in_frames, -1);
+
+  // Defaults from init
+  EXPECT_EQ(cfg.xlayers[0].profile, (unsigned int)MAIN_420_10_IP1);
+  EXPECT_EQ(cfg.xlayers[0].num_temporal_layers, 1);
+  EXPECT_EQ(cfg.xlayers[0].num_embedded_layers, 1);
+}
+
+// --- Error / Invalid Input Tests ---
+
+// A path that cannot be opened must make the parser return non-zero.
+TEST(XLayerConfigParse, NonexistentFile) {
+  MultiXLayerConfig cfg;
+  EXPECT_NE(parse_multi_xlayer_config("/nonexistent/path.json", &cfg), 0);
+}
+
+// Syntactically invalid JSON must be rejected.
+TEST(XLayerConfigParse, InvalidJson) {
+  const char *json = "{ this is not valid json }}}";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0);
+}
+
+// Valid JSON without an "xlayers" array must be rejected.
+TEST(XLayerConfigParse, MissingXlayersArray) {
+  const char *json = R"({ "output": "test.obu" })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0);
+}
+
+// xlayer_id 31 must be rejected by the parser.
+TEST(XLayerConfigParse, XlayerIdOutOfRange) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 31, "input": "a.raw" }
+    ]
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0);
+}
+
+// An xlayer without "input" parses, but is caught by validation.
+TEST(XLayerConfigParse, MissingInputField) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0 }
+    ]
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  // Parse succeeds (input is optional when shared source is used)
+  EXPECT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+  // But validation fails (no input and no shared source)
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// --- Validation Tests ---
+
+// Two xlayers with distinct ids and input filenames validate cleanly.
+TEST(XLayerConfigValidate, ValidTwoLayers) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 2;
+  cfg.xlayers[0].xlayer_id = 0;
+  snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw");
+  cfg.xlayers[1].xlayer_id = 1;
+  snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw");
+
+  EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// Two xlayers sharing the same xlayer_id must fail validation.
+TEST(XLayerConfigValidate, DuplicateXlayerId) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 2;
+  cfg.xlayers[0].xlayer_id = 3;
+  snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw");
+  cfg.xlayers[1].xlayer_id = 3;  // duplicate
+  snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw");
+
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// An xlayer whose input_filename was never set must fail validation.
+TEST(XLayerConfigValidate, MissingInputFilename) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 1;
+  cfg.xlayers[0].xlayer_id = 0;
+  // input_filename left empty
+
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// An operating point whose xlayer_map names an unconfigured xlayer must fail.
+TEST(XLayerConfigValidate, OpsReferencesInvalidXlayer) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 1;
+  cfg.xlayers[0].xlayer_id = 0;
+  snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw");
+
+  // OPS references xlayer 5, which doesn't exist
+  cfg.num_ops_sets = 1;
+  cfg.ops_sets[0].enable = 1;
+  cfg.ops_sets[0].num_operating_points = 1;
+  cfg.ops_sets[0].ops[0].xlayer_map = (1u << 5);  // xlayer 5
+
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// A config with no xlayers must fail validation.
+TEST(XLayerConfigValidate, ZeroXlayers) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 0;
+
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// With monotonic_output_order == 0, a mismatch between xlayers in any one of
+// num_temporal_layers, lag_in_frames, kf_max_dist or subgop_config_path must
+// fail validation. Each mismatch is undone before the next one is introduced.
+TEST(XLayerConfigValidate, NonMonotonicRequiresSameCodingStructure) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 2;
+  cfg.monotonic_output_order = 0;
+  cfg.xlayers[0].xlayer_id = 0;
+  snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw");
+  cfg.xlayers[0].num_temporal_layers = 1;
+  cfg.xlayers[1].xlayer_id = 1;
+  snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw");
+  cfg.xlayers[1].num_temporal_layers = 1;
+
+  // Same coding structure — should pass
+  EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0);
+
+  // Different num_temporal_layers — should fail
+  cfg.xlayers[1].num_temporal_layers = 3;
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+  cfg.xlayers[1].num_temporal_layers = 1;  // restore
+
+  // Different lag_in_frames — should fail
+  cfg.xlayers[0].lag_in_frames = 19;
+  cfg.xlayers[1].lag_in_frames = 35;
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+  cfg.xlayers[1].lag_in_frames = 19;  // restore
+
+  // Different kf_max_dist — should fail
+  cfg.xlayers[0].kf_max_dist = 150;
+  cfg.xlayers[1].kf_max_dist = 300;
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+  cfg.xlayers[1].kf_max_dist = 150;  // restore
+
+  // Different subgop_config — should fail
+  snprintf(cfg.xlayers[0].subgop_config_path, PATH_MAX, "low_delay.json");
+  snprintf(cfg.xlayers[1].subgop_config_path, PATH_MAX, "random_access.json");
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// With monotonic_output_order == 1, per-xlayer coding structures may differ.
+TEST(XLayerConfigValidate, MonotonicAllowsDifferentCodingStructure) {
+  MultiXLayerConfig cfg;
+  xlayer_config_init(&cfg);
+  cfg.num_xlayers = 2;
+  cfg.monotonic_output_order = 1;
+  cfg.xlayers[0].xlayer_id = 0;
+  snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw");
+  cfg.xlayers[0].num_temporal_layers = 1;
+  cfg.xlayers[0].lag_in_frames = 19;
+  cfg.xlayers[0].kf_max_dist = 150;
+  cfg.xlayers[1].xlayer_id = 1;
+  snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw");
+  cfg.xlayers[1].num_temporal_layers = 3;
+  cfg.xlayers[1].lag_in_frames = 35;
+  cfg.xlayers[1].kf_max_dist = 300;
+
+  // Different coding structures should be allowed with monotonic=1
+  EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// "kf_max_dist", "subgop_config" and "monotonic_output_order" are parsed; an
+// omitted "subgop_config" leaves subgop_config_path as an empty string.
+TEST(XLayerConfigParse, CodingStructureFields) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw",
+        "kf_max_dist": 150,
+        "subgop_config": "low_delay.json" },
+      { "xlayer_id": 1, "input": "b.raw",
+        "kf_max_dist": 300 }
+    ],
+    "monotonic_output_order": true
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+
+  EXPECT_EQ(cfg.xlayers[0].kf_max_dist, 150);
+  EXPECT_STREQ(cfg.xlayers[0].subgop_config_path, "low_delay.json");
+  EXPECT_EQ(cfg.xlayers[1].kf_max_dist, 300);
+  EXPECT_STREQ(cfg.xlayers[1].subgop_config_path, "");
+  EXPECT_EQ(cfg.monotonic_output_order, 1);
+}
+
+// A parsed config with monotonic_output_order=false and differing
+// num_temporal_layers is accepted by the parser but rejected by validation.
+TEST(XLayerConfigParse, NonMonotonicRejectsMismatch) {
+  const char *json = R"({
+    "xlayers": [
+      { "xlayer_id": 0, "input": "a.raw", "num_temporal_layers": 1 },
+      { "xlayer_id": 1, "input": "b.raw", "num_temporal_layers": 3 }
+    ],
+    "monotonic_output_order": false
+  })";
+  const char *path = WriteTempJson(json);
+  ASSERT_NE(path, nullptr);
+
+  MultiXLayerConfig cfg;
+  // Parsing succeeds but validation should fail
+  ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0);
+  EXPECT_NE(validate_multi_xlayer_config(&cfg), 0);
+}
+
+// --- Annex G Config File Parsing Tests ---
+
+// Helper to get the path to a config file in the source tree.
+// Relies on AVM_ROOT being the repo root (test runs from build dir).
+static std::string CfgPath(const char *relative) {
+  // Try the source tree relative to the build directory
+  const char *candidates[] = {
+    "../avm/cfg/xlayer/",     // build dir is sibling of avm/
+    "../../avm/cfg/xlayer/",  // one level deeper
+    "../cfg/xlayer/",         // build dir inside avm/
+    "cfg/xlayer/",            // running from repo root
+  };
+  for (const char *prefix : candidates) {
+    std::string path = std::string(prefix) + relative;
+    FILE *f = fopen(path.c_str(), "r");
+    if (f) {
+      fclose(f);
+      return path;
+    }
+  }
+  // Fall back — will fail with a clear error
+  return std::string("cfg/xlayer/") + relative;
+}
+
+// Returns true if `path` can be opened for reading. Used to tell "config file
+// is not available in this checkout" apart from "config file failed to parse".
+static bool CfgFileReadable(const std::string &path) {
+  FILE *f = fopen(path.c_str(), "r");
+  if (!f) return false;
+  fclose(f);
+  return true;
+}
+
+// Parses the Annex G.2 example config (nine xlayers) and checks the parsed
+// layout, OPS bitmasks and atlas flag, then validates the config.
+TEST(XLayerConfigAnnexG, G2_360Degree9Xlayer) {
+  std::string path = CfgPath("annexG2_360degree_9xlayer.json");
+  // Skip only when the file is genuinely absent; a file that exists but does
+  // not parse is a real failure and must not be hidden behind a skip.
+  if (!CfgFileReadable(path)) {
+    GTEST_SKIP() << "Config file not found: " << path;
+  }
+  MultiXLayerConfig cfg;
+  ASSERT_EQ(parse_multi_xlayer_config(path.c_str(), &cfg), 0)
+      << "Failed to parse config file: " << path;
+
+  EXPECT_EQ(cfg.num_xlayers, 9);
+
+  // Verify xlayer IDs are 0-8
+  for (int i = 0; i < 9; i++) {
+    EXPECT_EQ(cfg.xlayers[i].xlayer_id, i);
+  }
+
+  // All subpictures are 1280x640
+  for (int i = 0; i < 9; i++) {
+    EXPECT_EQ(cfg.xlayers[i].width, 1280u);
+    EXPECT_EQ(cfg.xlayers[i].height, 640u);
+  }
+
+  // Center viewport (xlayer 4) should have lowest QP (highest quality)
+  EXPECT_LT(cfg.xlayers[4].qp, cfg.xlayers[0].qp);
+
+  // 3 embedded layers per xlayer
+  for (int i = 0; i < 9; i++) {
+    EXPECT_EQ(cfg.xlayers[i].num_embedded_layers, 3);
+  }
+
+  EXPECT_EQ(cfg.enable_global_lcr, 1);
+  EXPECT_EQ(cfg.enable_msdo, 0);  // MSDO disabled (>4 streams)
+
+  // OPS: 3 operating points
+  EXPECT_EQ(cfg.num_ops_sets, 1);
+  EXPECT_EQ(cfg.ops_sets[0].num_operating_points, 3);
+
+  // OP0: center only (xlayer 4)
+  EXPECT_EQ(cfg.ops_sets[0].ops[0].xlayer_map, (1u << 4));
+
+  // OP2: all 9 subpictures
+  uint32_t all9 = (1u << 9) - 1;  // bits 0-8
+  EXPECT_EQ(cfg.ops_sets[0].ops[2].xlayer_map, all9);
+
+  EXPECT_EQ(cfg.enable_atlas, 1);
+  EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0);
+}
+
+TEST(XLayerConfigAnnexG, G3_VideoConf3Xlayer) { + std::string path = CfgPath("annexG3_videoconf_3xlayer.json"); + MultiXLayerConfig cfg; + int rc = parse_multi_xlayer_config(path.c_str(), &cfg); + if (rc != 0) { + GTEST_SKIP() << "Config file not found: " << path; + } + + EXPECT_EQ(cfg.num_xlayers, 3); + EXPECT_EQ(cfg.xlayers[0].xlayer_id, 0); + EXPECT_EQ(cfg.xlayers[1].xlayer_id, 1); + EXPECT_EQ(cfg.xlayers[2].xlayer_id, 2); + + // Main speaker: 1280x1080 + EXPECT_EQ(cfg.xlayers[0].width, 1280u); + EXPECT_EQ(cfg.xlayers[0].height, 1080u); + + // Participant 2: 480x360 (encoded small, upsampled by atlas) + EXPECT_EQ(cfg.xlayers[1].width, 480u); + EXPECT_EQ(cfg.xlayers[1].height, 360u); + + // Participant 3: 640x540 + EXPECT_EQ(cfg.xlayers[2].width, 640u); + EXPECT_EQ(cfg.xlayers[2].height, 540u); + + EXPECT_EQ(cfg.enable_global_lcr, 1); + EXPECT_EQ(cfg.lcr_purpose_id, 6); // Multiview Playback + + // OPS: 3 operating points + EXPECT_EQ(cfg.num_ops_sets, 1); + EXPECT_EQ(cfg.ops_sets[0].num_operating_points, 3); + + // OP0: main speaker only + EXPECT_EQ(cfg.ops_sets[0].ops[0].xlayer_map, (1u << 0)); + + // OP2: all 3 participants + EXPECT_EQ(cfg.ops_sets[0].ops[2].xlayer_map, + (1u << 0) | (1u << 1) | (1u << 2)); + + EXPECT_EQ(cfg.enable_atlas, 1); + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigAnnexG, G4_RoiScalable2Xlayer) { + std::string path = CfgPath("annexG4_roi_scalable_2xlayer.json"); + MultiXLayerConfig cfg; + int rc = parse_multi_xlayer_config(path.c_str(), &cfg); + if (rc != 0) { + GTEST_SKIP() << "Config file not found: " << path; + } + + EXPECT_EQ(cfg.num_xlayers, 2); + EXPECT_EQ(cfg.xlayers[0].xlayer_id, 0); + EXPECT_EQ(cfg.xlayers[1].xlayer_id, 1); + + // Base layer: full stadium 1920x1080 + EXPECT_EQ(cfg.xlayers[0].width, 1920u); + EXPECT_EQ(cfg.xlayers[0].height, 1080u); + + // Enhancement: field-of-play 1280x720 + EXPECT_EQ(cfg.xlayers[1].width, 1280u); + EXPECT_EQ(cfg.xlayers[1].height, 720u); + + // Enhancement should 
have better quality (lower QP) + EXPECT_LT(cfg.xlayers[1].qp, cfg.xlayers[0].qp); + + EXPECT_EQ(cfg.enable_global_lcr, 1); + EXPECT_EQ(cfg.enable_msdo, 0); + + // OPS: 3 operating points + EXPECT_EQ(cfg.num_ops_sets, 1); + EXPECT_EQ(cfg.ops_sets[0].num_operating_points, 3); + + // OP0: base only + EXPECT_EQ(cfg.ops_sets[0].ops[0].xlayer_map, (1u << 0)); + + // OP1: enhancement only + EXPECT_EQ(cfg.ops_sets[0].ops[1].xlayer_map, (1u << 1)); + + // OP2: both layers + EXPECT_EQ(cfg.ops_sets[0].ops[2].xlayer_map, (1u << 0) | (1u << 1)); + + EXPECT_EQ(cfg.enable_atlas, 1); + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +// --- GOP Config Tests --- + +TEST(XLayerConfigParse, GopModeFields) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", + "gop_mode": "closed", "fwd_kf_enabled": 1 }, + { "xlayer_id": 1, "input": "b.raw", + "gop_mode": "open_leading", "enable_keyframe_filtering": 2 }, + { "xlayer_id": 2, "input": "c.raw", + "gop_mode": "open_sef", "add_sef_for_hidden_frames": 1 } + ], + "monotonic_output_order": false + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].gop_mode, 0); + EXPECT_EQ(cfg.xlayers[0].fwd_kf_enabled, 1); + EXPECT_EQ(cfg.xlayers[1].gop_mode, 1); + EXPECT_EQ(cfg.xlayers[1].enable_keyframe_filtering, 2); + EXPECT_EQ(cfg.xlayers[2].gop_mode, 2); + EXPECT_EQ(cfg.xlayers[2].add_sef_for_hidden_frames, 1); +} + +TEST(XLayerConfigValidate, OpenLeadingRejectedWithMonotonic) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].gop_mode = 1; // open_leading + cfg.monotonic_output_order = 1; + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, OpenLeadingAllowedWithNonMonotonic) { + MultiXLayerConfig cfg; + 
xlayer_config_init(&cfg); + cfg.num_xlayers = 2; + cfg.monotonic_output_order = 0; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].gop_mode = 1; + cfg.xlayers[1].xlayer_id = 1; + snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw"); + cfg.xlayers[1].gop_mode = 1; + + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, NonMonotonicRequiresSameGopMode) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 2; + cfg.monotonic_output_order = 0; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].gop_mode = 0; + cfg.xlayers[1].xlayer_id = 1; + snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw"); + cfg.xlayers[1].gop_mode = 2; // mismatch + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigParse, GopModeDefaults) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw" } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // GOP mode defaults: 0 (closed), overrides = -1 (derive) + EXPECT_EQ(cfg.xlayers[0].gop_mode, 0); + EXPECT_EQ(cfg.xlayers[0].fwd_kf_enabled, -1); + EXPECT_EQ(cfg.xlayers[0].enable_keyframe_filtering, -1); + EXPECT_EQ(cfg.xlayers[0].add_sef_for_hidden_frames, -1); +} + +// --- Atlas Config Tests --- + +TEST(XLayerConfigParse, AtlasLayoutFields) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input": "b.raw", "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "atlas": { + "enable": true, + "mode": 0, + "width": 1920, + "height": 540, + "uniform_spacing": false + } + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; 
+ ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.enable_atlas, 1); + EXPECT_EQ(cfg.atlas_mode, 0); + EXPECT_EQ(cfg.atlas_width, 1920); + EXPECT_EQ(cfg.atlas_height, 540); + EXPECT_EQ(cfg.atlas_uniform_spacing, 0); + EXPECT_EQ(cfg.xlayers[0].atlas_pos_x, 0); + EXPECT_EQ(cfg.xlayers[0].atlas_pos_y, 0); + EXPECT_EQ(cfg.xlayers[1].atlas_pos_x, 960); + EXPECT_EQ(cfg.xlayers[1].atlas_pos_y, 0); +} + +// --- Scaling Mode / Embedded Layer Tests --- + +TEST(XLayerConfigParse, ScalingModeInteger) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "scaling_mode": [4, 6, 0] } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].num_embedded_layers, 3); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], 4); // AVME_ONEFOUR + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], 6); // AVME_ONETWO + EXPECT_EQ(cfg.xlayers[0].scaling_mode[2], 0); // AVME_NORMAL +} + +TEST(XLayerConfigParse, ScalingModeString) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "scaling_mode": ["1/4", "1/2", "1:1"] } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[2], AVME_NORMAL); +} + +TEST(XLayerConfigParse, ScalingModeAllStringVariants) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", + "num_embedded_layers": 7, + "scaling_mode": ["1/8", "1/4", "1/2", "3/5", "3/4", "4/5", "1:1"] } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + 
MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONEEIGHT); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_ONEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[2], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[3], AVME_THREEFIVE); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[4], AVME_THREEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[5], AVME_FOURFIVE); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[6], AVME_NORMAL); +} + +TEST(XLayerConfigParse, ScalingModeInvalidString) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", + "num_embedded_layers": 2, + "scaling_mode": ["bogus", "1:1"] } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0); +} + +TEST(XLayerConfigParse, ScalingModeDefaultDerivation2Layers) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", + "num_embedded_layers": 2 } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // Default for 2 layers: [1/2, 1:1] + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_NORMAL); +} + +TEST(XLayerConfigParse, ScalingModeDefaultDerivation3Layers) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", + "num_embedded_layers": 3 } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // Default for 3 layers: [1/4, 1/2, 1:1] + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[2], AVME_NORMAL); +} + +TEST(XLayerConfigParse, ScalingModeExplicitOverridesDefault) { + const char 
*json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", + "num_embedded_layers": 2, + "scaling_mode": ["3/4", "1:1"] } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // Explicit values override defaults + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_THREEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_NORMAL); +} + +TEST(XLayerConfigValidate, EmbeddedLayerLastMustBeFullRes) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].num_embedded_layers = 2; + cfg.xlayers[0].scaling_mode[0] = AVME_ONETWO; + cfg.xlayers[0].scaling_mode[1] = AVME_ONETWO; // Not full-res — invalid + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); + + // Fix it + cfg.xlayers[0].scaling_mode[1] = AVME_NORMAL; + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, EmbeddedLayerOutOfRange) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].num_embedded_layers = 0; // Invalid + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, EmbeddedLayerInvalidScalingMode) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].num_embedded_layers = 2; + cfg.xlayers[0].scaling_mode[0] = 99; // Invalid value + cfg.xlayers[0].scaling_mode[1] = AVME_NORMAL; + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, NonMonotonicAllowsDifferentEmbeddedLayers) { + // Different num_embedded_layers is valid — the constraint is that output + // frames within a TU must have matching order 
hints and synchronized RAPs, + // NOT that embedded layer counts match across xlayers. + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 2; + cfg.monotonic_output_order = 0; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].num_embedded_layers = 3; + cfg.xlayers[0].scaling_mode[0] = AVME_ONEFOUR; + cfg.xlayers[0].scaling_mode[1] = AVME_ONETWO; + cfg.xlayers[1].xlayer_id = 1; + snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw"); + cfg.xlayers[1].num_embedded_layers = 1; + + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, MonotonicAllowsDifferentEmbeddedLayers) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 2; + cfg.monotonic_output_order = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].num_embedded_layers = 3; + cfg.xlayers[0].scaling_mode[0] = AVME_ONEFOUR; + cfg.xlayers[0].scaling_mode[1] = AVME_ONETWO; + cfg.xlayers[1].xlayer_id = 1; + snprintf(cfg.xlayers[1].input_filename, PATH_MAX, "b.raw"); + cfg.xlayers[1].num_embedded_layers = 1; + + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +// --- Multi-Source Input Tests --- + +TEST(XLayerConfigParse, MultiSourceParsing) { + const char *json = R"({ + "inputs": [ + { "name": "texture", "filename": "video.raw", "width": 1920, + "height": 1080 }, + { "name": "alpha", "filename": "alpha.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "alpha", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.num_input_sources, 2); + 
EXPECT_STREQ(cfg.input_sources[0].name, "texture"); + EXPECT_STREQ(cfg.input_sources[0].filename, "video.raw"); + EXPECT_EQ(cfg.input_sources[0].width, 1920u); + EXPECT_EQ(cfg.input_sources[0].height, 1080u); + EXPECT_STREQ(cfg.input_sources[1].name, "alpha"); + EXPECT_STREQ(cfg.input_sources[1].filename, "alpha.raw"); + EXPECT_STREQ(cfg.xlayers[0].input_source_name, "texture"); + EXPECT_STREQ(cfg.xlayers[1].input_source_name, "alpha"); +} + +TEST(XLayerConfigParse, MultiSourceResolution) { + const char *json = R"({ + "inputs": [ + { "name": "texture", "filename": "video.raw", "width": 1920, + "height": 1080 }, + { "name": "alpha", "filename": "alpha.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "texture", "width": 960, + "height": 540, "atlas_pos_x": 960, "atlas_pos_y": 0 }, + { "xlayer_id": 2, "input_source": "alpha", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[1].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[2].input_source_idx, 1); +} + +TEST(XLayerConfigParse, MultiSourceBackwardCompat) { + const char *json = R"({ + "source": { + "filename": "video.raw", + "width": 1920, + "height": 1080 + }, + "xlayers": [ + { "xlayer_id": 0, "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // 
Legacy source is converted to input_sources[0] named "default" + EXPECT_EQ(cfg.num_input_sources, 1); + EXPECT_STREQ(cfg.input_sources[0].name, "default"); + EXPECT_STREQ(cfg.input_sources[0].filename, "video.raw"); + EXPECT_EQ(cfg.input_sources[0].width, 1920u); + EXPECT_EQ(cfg.input_sources[0].height, 1080u); + // Legacy fields still populated + EXPECT_STREQ(cfg.source_filename, "video.raw"); + + // Resolve should assign all xlayers to source 0 + ASSERT_EQ(resolve_input_sources(&cfg), 0); + EXPECT_EQ(cfg.xlayers[0].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[1].input_source_idx, 0); +} + +TEST(XLayerConfigParse, MultiSourceUnknownName) { + const char *json = R"({ + "inputs": [ + { "name": "texture", "filename": "video.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "nonexistent", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + // Resolve should fail — unknown source name + EXPECT_NE(resolve_input_sources(&cfg), 0); +} + +TEST(XLayerConfigValidate, MultiSourceDuplicateName) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_input_sources = 2; + snprintf(cfg.input_sources[0].name, MAX_SOURCE_NAME_LEN, "texture"); + snprintf(cfg.input_sources[0].filename, PATH_MAX, "a.raw"); + cfg.input_sources[0].width = 1920; + cfg.input_sources[0].height = 1080; + snprintf(cfg.input_sources[1].name, MAX_SOURCE_NAME_LEN, "texture"); + snprintf(cfg.input_sources[1].filename, PATH_MAX, "b.raw"); + cfg.input_sources[1].width = 1920; + cfg.input_sources[1].height = 1080; + + cfg.num_xlayers = 2; + cfg.xlayers[0].xlayer_id = 0; + cfg.xlayers[0].input_source_idx = 0; + cfg.xlayers[0].width = 960; + cfg.xlayers[0].height = 540; + cfg.xlayers[0].atlas_pos_x = 0; + cfg.xlayers[0].atlas_pos_y = 0; + cfg.xlayers[1].xlayer_id 
= 1; + cfg.xlayers[1].input_source_idx = 1; + cfg.xlayers[1].width = 960; + cfg.xlayers[1].height = 540; + cfg.xlayers[1].atlas_pos_x = 960; + cfg.xlayers[1].atlas_pos_y = 0; + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, MultiSourceChromaValidation) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_input_sources = 1; + snprintf(cfg.input_sources[0].name, MAX_SOURCE_NAME_LEN, "default"); + snprintf(cfg.input_sources[0].filename, PATH_MAX, "v.raw"); + cfg.input_sources[0].width = 1920; + cfg.input_sources[0].height = 1080; + + cfg.num_xlayers = 2; + cfg.xlayers[0].xlayer_id = 0; + cfg.xlayers[0].input_source_idx = 0; + cfg.xlayers[0].width = 960; + cfg.xlayers[0].height = 540; + cfg.xlayers[0].atlas_pos_x = 0; + cfg.xlayers[0].atlas_pos_y = 0; + cfg.xlayers[0].profile = MAIN_420_10_IP1; + cfg.xlayers[1].xlayer_id = 1; + cfg.xlayers[1].input_source_idx = 0; + cfg.xlayers[1].width = 960; + cfg.xlayers[1].height = 540; + cfg.xlayers[1].atlas_pos_x = 960; + cfg.xlayers[1].atlas_pos_y = 0; + cfg.xlayers[1].profile = MAIN_444_10_IP1; // Mismatch + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); + + // Fix chroma mismatch + cfg.xlayers[1].profile = MAIN_420_10_IP1; + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigParse, MultiSourceMixedMode) { + const char *json = R"({ + "inputs": [ + { "name": "texture", "filename": "video.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "texture", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input": "depth.raw", "width": 1920, + "height": 1080 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + + // xlayer 0 uses shared source, xlayer 1 uses own file + 
EXPECT_EQ(cfg.xlayers[0].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[1].input_source_idx, -1); + EXPECT_STREQ(cfg.xlayers[1].input_filename, "depth.raw"); +} + +TEST(XLayerConfigParse, MultiSourceSingleDefault) { + const char *json = R"({ + "inputs": [ + { "name": "main", "filename": "video.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "width": 960, "height": 540, + "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + + // Single input source — all unassigned xlayers auto-assign to it + EXPECT_EQ(cfg.xlayers[0].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[1].input_source_idx, 0); +} + +TEST(XLayerConfigParse, InputsAndSourceMutuallyExclusive) { + const char *json = R"({ + "inputs": [ + { "name": "texture", "filename": "video.raw", "width": 1920, + "height": 1080 } + ], + "source": { + "filename": "video.raw", + "width": 1920, + "height": 1080 + }, + "xlayers": [ + { "xlayer_id": 0, "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0); +} + +TEST(XLayerConfigParse, MultiSourceAmbiguousNoInputSource) { + const char *json = R"({ + "inputs": [ + { "name": "texture", "filename": "video.raw", "width": 1920, + "height": 1080 }, + { "name": "alpha", "filename": "alpha.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "width": 960, "height": 540, + "atlas_pos_x": 0, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig 
cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + // Multiple inputs, no explicit input_source — ambiguous, should fail + EXPECT_NE(resolve_input_sources(&cfg), 0); +} + +// --- Frame Rate Tests --- + +TEST(XLayerConfigParse, FrameRateIntegerParsing) { + const char *json = R"({ + "inputs": [ + { "name": "fast", "filename": "a.raw", "width": 1920, "height": 1080, + "frame_rate": 60 }, + { "name": "slow", "filename": "b.raw", "width": 1920, "height": 1080, + "frame_rate": 15 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "fast", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "slow", "width": 960, + "height": 540, "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.input_sources[0].frame_rate_num, 60); + EXPECT_EQ(cfg.input_sources[0].frame_rate_den, 1); + EXPECT_EQ(cfg.input_sources[1].frame_rate_num, 15); + EXPECT_EQ(cfg.input_sources[1].frame_rate_den, 1); + + ASSERT_EQ(resolve_input_sources(&cfg), 0); + EXPECT_EQ(cfg.input_sources[0].frame_skip, 1); // 60/60 = 1 + EXPECT_EQ(cfg.input_sources[1].frame_skip, 4); // 60/15 = 4 +} + +TEST(XLayerConfigParse, FrameRateRationalString) { + const char *json = R"({ + "inputs": [ + { "name": "ntsc", "filename": "a.raw", "width": 1920, "height": 1080, + "frame_rate": "30000/1001" }, + { "name": "half", "filename": "b.raw", "width": 1920, "height": 1080, + "frame_rate": "15000/1001" } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "ntsc", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "half", "width": 960, + "height": 540, "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + 
ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.input_sources[0].frame_rate_num, 30000); + EXPECT_EQ(cfg.input_sources[0].frame_rate_den, 1001); + EXPECT_EQ(cfg.input_sources[1].frame_rate_num, 15000); + EXPECT_EQ(cfg.input_sources[1].frame_rate_den, 1001); + + ASSERT_EQ(resolve_input_sources(&cfg), 0); + // 30000/1001 / (15000/1001) = 30000*1001 / (1001*15000) = 2 + EXPECT_EQ(cfg.input_sources[0].frame_skip, 1); + EXPECT_EQ(cfg.input_sources[1].frame_skip, 2); +} + +TEST(XLayerConfigParse, FrameRateNonDivisorFails) { + const char *json = R"({ + "inputs": [ + { "name": "a", "filename": "a.raw", "width": 1920, "height": 1080, + "frame_rate": 30 }, + { "name": "b", "filename": "b.raw", "width": 1920, "height": 1080, + "frame_rate": 24 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "a", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "b", "width": 960, + "height": 540, "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + // 30/24 = 1.25, not an integer — should fail + EXPECT_NE(resolve_input_sources(&cfg), 0); +} + +TEST(XLayerConfigParse, FrameRateUnspecifiedAssumesMax) { + const char *json = R"({ + "inputs": [ + { "name": "fast", "filename": "a.raw", "width": 1920, "height": 1080, + "frame_rate": 60 }, + { "name": "auto", "filename": "b.raw", "width": 1920, + "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "fast", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "auto", "width": 960, + "height": 540, "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + 
ASSERT_EQ(resolve_input_sources(&cfg), 0); + + // Unspecified source assumes master rate (skip=1) + EXPECT_EQ(cfg.input_sources[0].frame_skip, 1); + EXPECT_EQ(cfg.input_sources[1].frame_skip, 1); +} + +TEST(XLayerConfigParse, FrameRateAllSameNoSkip) { + const char *json = R"({ + "inputs": [ + { "name": "a", "filename": "a.raw", "width": 1920, "height": 1080, + "frame_rate": 30 }, + { "name": "b", "filename": "b.raw", "width": 1920, "height": 1080, + "frame_rate": 30 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "a", "width": 960, + "height": 540, "atlas_pos_x": 0, "atlas_pos_y": 0 }, + { "xlayer_id": 1, "input_source": "b", "width": 960, + "height": 540, "atlas_pos_x": 960, "atlas_pos_y": 0 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + + EXPECT_EQ(cfg.input_sources[0].frame_skip, 1); + EXPECT_EQ(cfg.input_sources[1].frame_skip, 1); +} + +// --- Embedded Layers (Per-MLayer Source) Tests --- + +TEST(XLayerConfigParse, EmbeddedLayersParsing) { + const char *json = R"({ + "inputs": [ + { "name": "left", "filename": "left.raw", "width": 1920, "height": 1080 }, + { "name": "right", "filename": "right.raw", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "left", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "input_source": "left", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [] }, + { "scaling_mode": "1:1", "input_source": "right", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080, + "depends_on": [0] } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "stereo_ml.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + 
ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + ASSERT_EQ(validate_multi_xlayer_config(&cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].num_embedded_layers, 2); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_NORMAL); + EXPECT_EQ(cfg.xlayers[0].has_per_mlayer_sources, 1); + EXPECT_EQ(cfg.xlayers[0].has_mlayer_dependencies, 1); + + // mlayer 0: source "left" + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].dependency_mask, 0); + + // mlayer 1: source "right", depends on mlayer 0 + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].input_source_idx, 1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].dependency_mask, 1); +} + +TEST(XLayerConfigParse, EmbeddedLayersScalingModeOnly) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "test.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "embedded_layers": [ + { "scaling_mode": "1/4" }, + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[2], AVME_NORMAL); + EXPECT_EQ(cfg.xlayers[0].has_per_mlayer_sources, 0); + EXPECT_EQ(cfg.xlayers[0].has_mlayer_dependencies, 0); +} + +TEST(XLayerConfigParse, EmbeddedLayersAndScalingModeMutualExclusion) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "test.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 2, + "scaling_mode": ["1/2", "1:1"], + "embedded_layers": [ + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + 
})"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0); +} + +TEST(XLayerConfigParse, EmbeddedLayersCountMismatch) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "test.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "embedded_layers": [ + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + EXPECT_NE(parse_multi_xlayer_config(path, &cfg), 0); +} + +TEST(XLayerConfigParse, EmbeddedLayersDependsOnParsing) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "test.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "embedded_layers": [ + { "scaling_mode": "1/4", "depends_on": [] }, + { "scaling_mode": "1/2", "depends_on": [0] }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // mlayer 0: depends_on: [] -> mask=0 + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].dependency_mask, 0); + // mlayer 1: depends_on: [0] -> mask=1 + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].dependency_mask, 1); + // mlayer 2: no depends_on -> mask=-1 (default) + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[2].dependency_mask, -1); + EXPECT_EQ(cfg.xlayers[0].has_mlayer_dependencies, 1); +} + +TEST(XLayerConfigResolve, EmbeddedLayersSourceResolution) { + const char *json = R"({ + "inputs": [ + { "name": "main", "filename": "main.raw", "width": 1920, "height": 1080 }, + { "name": "aux", "filename": "aux.raw", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "main", "width": 1920, "height": 1080, + "atlas_pos_x": 0, 
"atlas_pos_y": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", "input_source": "main", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080 }, + { "scaling_mode": "1:1", "input_source": "aux", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080 } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].input_source_idx, 0); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].input_source_idx, 1); +} + +TEST(XLayerConfigResolve, EmbeddedLayersInheritance) { + const char *json = R"({ + "inputs": [ + { "name": "main", "filename": "main.raw", "width": 3840, "height": 2160 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "main", "width": 1920, "height": 1080, + "atlas_pos_x": 100, "atlas_pos_y": 200, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1" } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + + // Both mlayers inherit from xlayer — since no mlayer has explicit source, + // has_per_mlayer_sources is 0, so mlayer_sources stay at defaults. + // The encoder uses the xlayer's source for all mlayers automatically. 
+ EXPECT_EQ(cfg.xlayers[0].has_per_mlayer_sources, 0); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].input_source_idx, -1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].input_source_idx, -1); +} + +TEST(XLayerConfigResolve, EmbeddedLayersUnknownSource) { + const char *json = R"({ + "inputs": [ + { "name": "main", "filename": "main.raw", "width": 1920, "height": 1080 } + ], + "xlayers": [ + { "xlayer_id": 0, "input_source": "main", "width": 1920, "height": 1080, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1", "input_source": "nonexistent", + "atlas_pos_x": 0, "atlas_pos_y": 0, "width": 1920, "height": 1080 } + ], + "qp": 128, "cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + EXPECT_NE(resolve_input_sources(&cfg), 0); +} + +TEST(XLayerConfigValidate, EmbeddedLayersRequireDimensions) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_input_sources = 2; + snprintf(cfg.input_sources[0].name, MAX_SOURCE_NAME_LEN, "left"); + snprintf(cfg.input_sources[0].filename, PATH_MAX, "left.raw"); + cfg.input_sources[0].width = 1920; + cfg.input_sources[0].height = 1080; + snprintf(cfg.input_sources[1].name, MAX_SOURCE_NAME_LEN, "right"); + snprintf(cfg.input_sources[1].filename, PATH_MAX, "right.raw"); + cfg.input_sources[1].width = 1920; + cfg.input_sources[1].height = 1080; + + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + cfg.xlayers[0].input_source_idx = 0; + cfg.xlayers[0].width = 1920; + cfg.xlayers[0].height = 1080; + cfg.xlayers[0].atlas_pos_x = 0; + cfg.xlayers[0].atlas_pos_y = 0; + cfg.xlayers[0].num_embedded_layers = 2; + cfg.xlayers[0].scaling_mode[0] = AVME_ONETWO; + cfg.xlayers[0].scaling_mode[1] = AVME_NORMAL; + cfg.xlayers[0].has_per_mlayer_sources = 1; + + // mlayer 1 has source but no 
width/height + cfg.xlayers[0].mlayer_sources[0].input_source_idx = 0; + cfg.xlayers[0].mlayer_sources[0].atlas_pos_x = 0; + cfg.xlayers[0].mlayer_sources[0].atlas_pos_y = 0; + cfg.xlayers[0].mlayer_sources[0].width = 1920; + cfg.xlayers[0].mlayer_sources[0].height = 1080; + + cfg.xlayers[0].mlayer_sources[1].input_source_idx = 1; + cfg.xlayers[0].mlayer_sources[1].atlas_pos_x = 0; + cfg.xlayers[0].mlayer_sources[1].atlas_pos_y = 0; + cfg.xlayers[0].mlayer_sources[1].width = 0; // Missing! + cfg.xlayers[0].mlayer_sources[1].height = 0; // Missing! + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); + + // Fix: add dimensions + cfg.xlayers[0].mlayer_sources[1].width = 1920; + cfg.xlayers[0].mlayer_sources[1].height = 1080; + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigValidate, EmbeddedLayersDependsOnRange) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "test.raw"); + cfg.xlayers[0].width = 1920; + cfg.xlayers[0].height = 1080; + cfg.xlayers[0].num_embedded_layers = 2; + cfg.xlayers[0].scaling_mode[0] = AVME_ONETWO; + cfg.xlayers[0].scaling_mode[1] = AVME_NORMAL; + cfg.xlayers[0].has_mlayer_dependencies = 1; + + // mlayer 0 trying to depend on mlayer 1 (invalid: >= self) + cfg.xlayers[0].mlayer_sources[0].dependency_mask = 0x02; // bit 1 set + + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); + + // Fix: mlayer 0 depends on nothing + cfg.xlayers[0].mlayer_sources[0].dependency_mask = 0; + cfg.xlayers[0].mlayer_sources[1].dependency_mask = 1; // depends on 0 + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigParse, EmbeddedLayersBackwardCompat) { + // Existing flat scaling_mode array should still work + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "test.raw", "width": 1920, "height": 1080, + "num_embedded_layers": 3, + "scaling_mode": ["1/4", "1/2", "1:1"], + "qp": 128, 
"cpu_used": 5 } + ], + "output": "out.obu" + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + EXPECT_EQ(cfg.xlayers[0].scaling_mode[0], AVME_ONEFOUR); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[1], AVME_ONETWO); + EXPECT_EQ(cfg.xlayers[0].scaling_mode[2], AVME_NORMAL); + EXPECT_EQ(cfg.xlayers[0].has_per_mlayer_sources, 0); + EXPECT_EQ(cfg.xlayers[0].has_mlayer_dependencies, 0); + // No embedded_layers array means mlayer_sources stay at defaults + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].input_source_idx, -1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].dependency_mask, -1); +} + +// --- Codec Controls Tests --- + +TEST(XLayerConfig, CodecControlsParsing) { + const char *json = R"({ + "inputs": [{ "name": "src", "filename": "test.raw", "width": 64, "height": 64 }], + "xlayers": [{ + "xlayer_id": 0, "input_source": "src", + "width": 64, "height": 64, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "codec_controls": [ + ["enable_deblocking", 0], + ["enable_cdef", 0], + ["enable_intrabc", 0] + ] + }], + "ops": [{ "ops_id": 0, "priority": 0, "intent_present": true, + "ptl_present": true, + "operating_points": [{ "intent": 0, "xlayer_map": [0] }] }], + "output": "/tmp/test_cc.obu" + })"; + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(WriteTempJson(json), &cfg), 0); + EXPECT_EQ(cfg.xlayers[0].num_codec_controls, 3); + EXPECT_STREQ(cfg.xlayers[0].codec_controls[0].name, "enable_deblocking"); + EXPECT_EQ(cfg.xlayers[0].codec_controls[0].value, 0); + EXPECT_STREQ(cfg.xlayers[0].codec_controls[1].name, "enable_cdef"); + EXPECT_EQ(cfg.xlayers[0].codec_controls[1].value, 0); + EXPECT_STREQ(cfg.xlayers[0].codec_controls[2].name, "enable_intrabc"); + EXPECT_EQ(cfg.xlayers[0].codec_controls[2].value, 0); +} + +TEST(XLayerConfig, CodecControlsInvalidFormat) { + // codec_controls entry is not a [name, value] pair + const char *json = R"({ + "inputs": [{ "name": 
"src", "filename": "test.raw", "width": 64, "height": 64 }], + "xlayers": [{ + "xlayer_id": 0, "input_source": "src", + "width": 64, "height": 64, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "codec_controls": [ + ["enable_deblocking"] + ] + }], + "ops": [{ "ops_id": 0, "priority": 0, "intent_present": true, + "ptl_present": true, + "operating_points": [{ "intent": 0, "xlayer_map": [0] }] }], + "output": "/tmp/test_cc.obu" + })"; + MultiXLayerConfig cfg; + EXPECT_NE(parse_multi_xlayer_config(WriteTempJson(json), &cfg), 0); +} + +TEST(XLayerConfig, CodecControlsEmpty) { + // Empty codec_controls array is valid + const char *json = R"({ + "inputs": [{ "name": "src", "filename": "test.raw", "width": 64, "height": 64 }], + "xlayers": [{ + "xlayer_id": 0, "input_source": "src", + "width": 64, "height": 64, + "atlas_pos_x": 0, "atlas_pos_y": 0, + "codec_controls": [] + }], + "ops": [{ "ops_id": 0, "priority": 0, "intent_present": true, + "ptl_present": true, + "operating_points": [{ "intent": 0, "xlayer_map": [0] }] }], + "output": "/tmp/test_cc.obu" + })"; + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(WriteTempJson(json), &cfg), 0); + EXPECT_EQ(cfg.xlayers[0].num_codec_controls, 0); +} + +// --- Per-MLayer Content Interpretation Tests --- + +TEST(XLayerConfigParse, EmbeddedLayersCIParsing) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "width": 1920, "height": 1080, + "color_primaries": 1, + "transfer_characteristics": 1, + "matrix_coefficients": 1, + "full_range_flag": 0, + "num_embedded_layers": 2, + "embedded_layers": [ + { "scaling_mode": "1/2", + "color_primaries": 9, + "transfer_characteristics": 16, + "matrix_coefficients": 9, + "full_range_flag": 1 }, + { "scaling_mode": "1:1" } + ] + } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // mlayer 0: explicit CI values + 
EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].color_primaries, 9); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].transfer_characteristics, 16); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].matrix_coefficients, 9); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].full_range_flag, 1); + + // mlayer 1: no CI fields → -1 (inherit from xlayer) + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].color_primaries, -1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].transfer_characteristics, -1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].matrix_coefficients, -1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].full_range_flag, -1); +} + +TEST(XLayerConfigResolve, EmbeddedLayersCIInheritance) { + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "width": 1920, "height": 1080, + "color_primaries": 1, + "transfer_characteristics": 13, + "matrix_coefficients": 6, + "full_range_flag": 0, + "num_embedded_layers": 3, + "embedded_layers": [ + { "scaling_mode": "1/4", + "color_primaries": 9 }, + { "scaling_mode": "1/2" }, + { "scaling_mode": "1:1", + "full_range_flag": 1 } + ] + } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + ASSERT_EQ(resolve_input_sources(&cfg), 0); + resolve_mlayer_ci(&cfg); + + // mlayer 0: explicit color_primaries=9, rest inherit from xlayer + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].color_primaries, 9); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].transfer_characteristics, 13); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].matrix_coefficients, 6); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[0].full_range_flag, 0); + + // mlayer 1: all inherit from xlayer + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].color_primaries, 1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].transfer_characteristics, 13); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].matrix_coefficients, 6); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[1].full_range_flag, 0); + + // 
mlayer 2: explicit full_range_flag=1, rest inherit from xlayer + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[2].color_primaries, 1); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[2].transfer_characteristics, 13); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[2].matrix_coefficients, 6); + EXPECT_EQ(cfg.xlayers[0].mlayer_sources[2].full_range_flag, 1); +} + +TEST(XLayerConfigParse, XLayerColorPropagation) { + // Verify xlayer-level color fields are parsed correctly + const char *json = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "width": 1920, "height": 1080, + "color_primaries": 9, + "transfer_characteristics": 16, + "matrix_coefficients": 9, + "full_range_flag": 1 }, + { "xlayer_id": 1, "input": "b.raw", "width": 1920, "height": 1080 } + ] + })"; + const char *path = WriteTempJson(json); + ASSERT_NE(path, nullptr); + + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + + // xlayer 0: explicit color info + EXPECT_EQ(cfg.xlayers[0].color_primaries, 9); + EXPECT_EQ(cfg.xlayers[0].transfer_characteristics, 16); + EXPECT_EQ(cfg.xlayers[0].matrix_coefficients, 9); + EXPECT_EQ(cfg.xlayers[0].full_range_flag, 1); + + // xlayer 1: no color info → -1 (use codec defaults) + EXPECT_EQ(cfg.xlayers[1].color_primaries, -1); + EXPECT_EQ(cfg.xlayers[1].transfer_characteristics, -1); + EXPECT_EQ(cfg.xlayers[1].matrix_coefficients, -1); + EXPECT_EQ(cfg.xlayers[1].full_range_flag, -1); +} + +TEST(XLayerConfigValidate, EmbeddedLayersCIRangeValidation) { + MultiXLayerConfig cfg; + xlayer_config_init(&cfg); + cfg.num_xlayers = 1; + cfg.xlayers[0].xlayer_id = 0; + snprintf(cfg.xlayers[0].input_filename, PATH_MAX, "a.raw"); + cfg.xlayers[0].width = 416; + cfg.xlayers[0].height = 240; + cfg.xlayers[0].num_embedded_layers = 1; + + // Valid: no CI specified (all -1) + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); + + // Invalid: color_primaries = 300 + cfg.xlayers[0].mlayer_sources[0].color_primaries = 300; + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); 
+ cfg.xlayers[0].mlayer_sources[0].color_primaries = -1; + + // Invalid: full_range_flag = 2 + cfg.xlayers[0].mlayer_sources[0].full_range_flag = 2; + EXPECT_NE(validate_multi_xlayer_config(&cfg), 0); + cfg.xlayers[0].mlayer_sources[0].full_range_flag = -1; + + // Valid again + EXPECT_EQ(validate_multi_xlayer_config(&cfg), 0); +} + +TEST(XLayerConfigParse, LimitField) { + // Default: limit=0 (unlimited) + const char *json_no_limit = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "qp": 100 }, + { "xlayer_id": 1, "input": "b.raw", "qp": 160 } + ] + })"; + const char *path = WriteTempJson(json_no_limit); + ASSERT_NE(path, nullptr); + MultiXLayerConfig cfg; + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + EXPECT_EQ(cfg.limit, 0); + + // Explicit limit + const char *json_limit = R"({ + "xlayers": [ + { "xlayer_id": 0, "input": "a.raw", "qp": 100 }, + { "xlayer_id": 1, "input": "b.raw", "qp": 160 } + ], + "limit": 5 + })"; + path = WriteTempJson(json_limit); + ASSERT_NE(path, nullptr); + ASSERT_EQ(parse_multi_xlayer_config(path, &cfg), 0); + EXPECT_EQ(cfg.limit, 5); +} + +} // namespace diff --git a/third_party/cJSON/LICENSE b/third_party/cJSON/LICENSE new file mode 100644 index 0000000000..47234478c1 --- /dev/null +++ b/third_party/cJSON/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/third_party/cJSON/cJSON.c b/third_party/cJSON/cJSON.c new file mode 100644 index 0000000000..cd3ff5206e --- /dev/null +++ b/third_party/cJSON/cJSON.c @@ -0,0 +1,362 @@ +/* + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* Minimal cJSON implementation for AVM xlayer config parsing. 
*/ + +#include "third_party/cJSON/cJSON.h" + +#include +#include +#include +#include +#include +#include +#include + +/* --- Internal helpers --- */ + +static cJSON *cJSON_New_Item(void) { + cJSON *node = (cJSON *)calloc(1, sizeof(cJSON)); + return node; +} + +/* Skip whitespace and comments */ +static const char *skip_whitespace(const char *in) { + if (in == NULL) return NULL; + while (*in && (unsigned char)*in <= ' ') in++; + /* Skip // line comments */ + while (*in == '/' && *(in + 1) == '/') { + while (*in && *in != '\n') in++; + while (*in && (unsigned char)*in <= ' ') in++; + } + return in; +} + +/* Forward declarations */ +static const char *parse_value(cJSON *item, const char *value); +static const char *parse_string(cJSON *item, const char *str); +static const char *parse_number(cJSON *item, const char *num); +static const char *parse_array(cJSON *item, const char *value); +static const char *parse_object(cJSON *item, const char *value); + +/* --- Parse string --- */ +static unsigned parse_hex4(const char *str) { + unsigned h = 0; + for (int i = 0; i < 4; i++) { + h <<= 4; + if (*str >= '0' && *str <= '9') + h += (unsigned)(*str - '0'); + else if (*str >= 'A' && *str <= 'F') + h += (unsigned)(10 + *str - 'A'); + else if (*str >= 'a' && *str <= 'f') + h += (unsigned)(10 + *str - 'a'); + else + return 0; + str++; + } + return h; +} + +static const char *parse_string(cJSON *item, const char *str) { + if (*str != '\"') return NULL; + str++; + + const char *start = str; + size_t len = 0; + + /* First pass: compute length */ + while (*str && *str != '\"') { + if (*str == '\\') { + str++; + if (*str == 'u') { + str += 4; + len += 4; /* UTF-8 worst case, simplified */ + } else { + len++; + } + } else { + len++; + } + str++; + } + if (*str != '\"') return NULL; + + char *out = (char *)malloc(len + 1); + if (!out) return NULL; + + str = start; + char *ptr = out; + while (*str && *str != '\"') { + if (*str != '\\') { + *ptr++ = *str++; + } else { + str++; + switch 
(*str) { + case 'b': *ptr++ = '\b'; break; + case 'f': *ptr++ = '\f'; break; + case 'n': *ptr++ = '\n'; break; + case 'r': *ptr++ = '\r'; break; + case 't': *ptr++ = '\t'; break; + case 'u': { + unsigned uc = parse_hex4(str + 1); + str += 4; + /* Simple UTF-8 encoding */ + if (uc < 0x80) { + *ptr++ = (char)uc; + } else if (uc < 0x800) { + *ptr++ = (char)(0xC0 | (uc >> 6)); + *ptr++ = (char)(0x80 | (uc & 0x3F)); + } else { + *ptr++ = (char)(0xE0 | (uc >> 12)); + *ptr++ = (char)(0x80 | ((uc >> 6) & 0x3F)); + *ptr++ = (char)(0x80 | (uc & 0x3F)); + } + break; + } + default: *ptr++ = *str; break; + } + str++; + } + } + *ptr = '\0'; + + item->valuestring = out; + item->type = cJSON_String; + return str + 1; /* skip closing quote */ +} + +/* --- Parse number --- */ +static const char *parse_number(cJSON *item, const char *num) { + double n = 0; + double sign = 1; + int scale = 0; + int subscale = 0; + int signsubscale = 1; + + if (*num == '-') { + sign = -1; + num++; + } + if (*num == '0') { + num++; + } else if (*num >= '1' && *num <= '9') { + do { + n = n * 10.0 + (*num - '0'); + num++; + } while (*num >= '0' && *num <= '9'); + } + if (*num == '.' 
&& num[1] >= '0' && num[1] <= '9') { + num++; + do { + n = n * 10.0 + (*num - '0'); + scale--; + num++; + } while (*num >= '0' && *num <= '9'); + } + if (*num == 'e' || *num == 'E') { + num++; + if (*num == '+') + num++; + else if (*num == '-') { + signsubscale = -1; + num++; + } + while (*num >= '0' && *num <= '9') { + subscale = subscale * 10 + (*num - '0'); + num++; + } + } + + n = sign * n * pow(10.0, scale + subscale * signsubscale); + + item->valuedouble = n; + item->valueint = (int)n; + item->type = cJSON_Number; + return num; +} + +/* --- Parse array --- */ +static const char *parse_array(cJSON *item, const char *value) { + if (*value != '[') return NULL; + item->type = cJSON_Array; + value = skip_whitespace(value + 1); + if (*value == ']') return value + 1; /* empty array */ + + cJSON *child = cJSON_New_Item(); + if (!child) return NULL; + item->child = child; + value = skip_whitespace(parse_value(child, skip_whitespace(value))); + if (!value) return NULL; + + while (*value == ',') { + cJSON *new_item = cJSON_New_Item(); + if (!new_item) return NULL; + child->next = new_item; + new_item->prev = child; + child = new_item; + value = skip_whitespace(parse_value(child, skip_whitespace(value + 1))); + if (!value) return NULL; + } + + if (*value == ']') return value + 1; + return NULL; /* malformed */ +} + +/* --- Parse object --- */ +static const char *parse_object(cJSON *item, const char *value) { + if (*value != '{') return NULL; + item->type = cJSON_Object; + value = skip_whitespace(value + 1); + if (*value == '}') return value + 1; /* empty object */ + + cJSON *child = cJSON_New_Item(); + if (!child) return NULL; + item->child = child; + + /* Parse key */ + value = parse_string(child, skip_whitespace(value)); + if (!value) return NULL; + child->string = child->valuestring; + child->valuestring = NULL; + child->type = cJSON_Invalid; + + if (*value != ':') return NULL; + value = skip_whitespace(parse_value(child, skip_whitespace(value + 1))); + if (!value) 
return NULL; + + while (*value == ',') { + cJSON *new_item = cJSON_New_Item(); + if (!new_item) return NULL; + child->next = new_item; + new_item->prev = child; + child = new_item; + + value = parse_string(child, skip_whitespace(value + 1)); + if (!value) return NULL; + child->string = child->valuestring; + child->valuestring = NULL; + child->type = cJSON_Invalid; + + if (*value != ':') return NULL; + value = skip_whitespace(parse_value(child, skip_whitespace(value + 1))); + if (!value) return NULL; + } + + if (*value == '}') return value + 1; + return NULL; /* malformed */ +} + +/* --- Parse value --- */ +static const char *parse_value(cJSON *item, const char *value) { + if (!value) return NULL; + if (!strncmp(value, "null", 4)) { + item->type = cJSON_NULL; + return value + 4; + } + if (!strncmp(value, "false", 5)) { + item->type = cJSON_False; + item->valueint = 0; + return value + 5; + } + if (!strncmp(value, "true", 4)) { + item->type = cJSON_True; + item->valueint = 1; + return value + 4; + } + if (*value == '\"') return parse_string(item, value); + if (*value == '-' || (*value >= '0' && *value <= '9')) + return parse_number(item, value); + if (*value == '[') return parse_array(item, value); + if (*value == '{') return parse_object(item, value); + return NULL; /* failure */ +} + +/* --- Public API --- */ + +cJSON *cJSON_Parse(const char *value) { + cJSON *c = cJSON_New_Item(); + if (!c) return NULL; + const char *end = parse_value(c, skip_whitespace(value)); + if (!end) { + cJSON_Delete(c); + return NULL; + } + return c; +} + +void cJSON_Delete(cJSON *item) { + cJSON *next; + while (item) { + next = item->next; + if (!(item->type & cJSON_IsReference) && item->child) + cJSON_Delete(item->child); + if (!(item->type & cJSON_IsReference) && item->valuestring) + free(item->valuestring); + if (!(item->type & cJSON_StringIsConst) && item->string) + free(item->string); + free(item); + item = next; + } +} + +int cJSON_GetArraySize(const cJSON *array) { + cJSON *child; 
+ int size = 0; + if (!array) return 0; + child = array->child; + while (child) { + size++; + child = child->next; + } + return size; +} + +cJSON *cJSON_GetArrayItem(const cJSON *array, int index) { + if (!array || index < 0) return NULL; + cJSON *child = array->child; + while (child && index > 0) { + child = child->next; + index--; + } + return child; +} + +cJSON *cJSON_GetObjectItemCaseSensitive(const cJSON *object, + const char *string) { + if (!object || !string) return NULL; + cJSON *child = object->child; + while (child) { + if (child->string && strcmp(child->string, string) == 0) return child; + child = child->next; + } + return NULL; +} + +char *cJSON_GetStringValue(const cJSON *item) { + if (!cJSON_IsString(item)) return NULL; + return item->valuestring; +} + +double cJSON_GetNumberValue(const cJSON *item) { + if (!cJSON_IsNumber(item)) return 0.0; + return item->valuedouble; +} diff --git a/third_party/cJSON/cJSON.h b/third_party/cJSON/cJSON.h new file mode 100644 index 0000000000..6432be13d1 --- /dev/null +++ b/third_party/cJSON/cJSON.h @@ -0,0 +1,116 @@ +/* + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* Minimal cJSON implementation for AVM xlayer config parsing. + * Supports: objects, arrays, strings, numbers, booleans, null. + * Based on the cJSON API by Dave Gamble. */ + +#ifndef CJSON_H +#define CJSON_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* cJSON Types: */ +#define cJSON_Invalid (0) +#define cJSON_False (1 << 0) +#define cJSON_True (1 << 1) +#define cJSON_NULL (1 << 2) +#define cJSON_Number (1 << 3) +#define cJSON_String (1 << 4) +#define cJSON_Array (1 << 5) +#define cJSON_Object (1 << 6) +#define cJSON_Raw (1 << 7) + +#define cJSON_IsReference 256 +#define cJSON_StringIsConst 512 + +/* The cJSON structure: */ +typedef struct cJSON { + struct cJSON *next; + struct cJSON *prev; + struct cJSON *child; + + int type; + char *valuestring; + int valueint; + double valuedouble; + char *string; +} cJSON; + +/* Supply a block of JSON, and this returns a cJSON object you can + * interrogate. */ +cJSON *cJSON_Parse(const char *value); + +/* Delete a cJSON entity and all subentities. */ +void cJSON_Delete(cJSON *item); + +/* Returns the number of items in an array (or object). */ +int cJSON_GetArraySize(const cJSON *array); + +/* Retrieve item number "index" from array "array". Returns NULL if + * unsuccessful. */ +cJSON *cJSON_GetArrayItem(const cJSON *array, int index); + +/* Get item "string" from object. Case sensitive. 
*/ +cJSON *cJSON_GetObjectItemCaseSensitive(const cJSON *object, + const char *string); + +/* Check item type */ +#define cJSON_IsInvalid(item) \ + ((item) == NULL || ((item)->type & 0xFF) == cJSON_Invalid) +#define cJSON_IsFalse(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_False) +#define cJSON_IsTrue(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_True) +#define cJSON_IsBool(item) \ + ((item) != NULL && (((item)->type & 0xFF) & (cJSON_True | cJSON_False))) +#define cJSON_IsNull(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_NULL) +#define cJSON_IsNumber(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_Number) +#define cJSON_IsString(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_String) +#define cJSON_IsArray(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_Array) +#define cJSON_IsObject(item) \ + ((item) != NULL && ((item)->type & 0xFF) == cJSON_Object) + +/* Return string value, or NULL */ +char *cJSON_GetStringValue(const cJSON *item); + +/* Return number value, or 0 */ +double cJSON_GetNumberValue(const cJSON *item); + +/* Macro to iterate over array/object children */ +#define cJSON_ArrayForEach(element, array) \ + for (element = (array != NULL) ? (array)->child : NULL; element != NULL; \ + element = element->next) + +#ifdef __cplusplus +} +#endif + +#endif /* CJSON_H */ diff --git a/tools/stream_demuxer.cc b/tools/stream_demuxer.cc index d28f0197da..2f9dbfdfb7 100644 --- a/tools/stream_demuxer.cc +++ b/tools/stream_demuxer.cc @@ -11,6 +11,45 @@ */ #include "tools/stream_mux.h" + +// Parse a Global LCR OBU to discover xlayer IDs. +// Returns the number of xlayers found, or -1 on error. +// stream_ids[] is populated with the xlayer IDs found in the LCR xlayer_map. 
+static int read_global_lcr_xlayer_map(struct avm_read_bit_buffer *rb, + int *stream_ids) { + // lcr_is_global_flag + const int is_global = avm_rb_read_bit(rb); + if (!is_global) return -1; // Not a global LCR + + // lcr_global_config_record_id (LCR_ID_BITS = 3) + avm_rb_read_literal(rb, LCR_ID_BITS); + + // lcr_xlayer_map (MAX_NUM_XLAYERS - 1 = 31 bits) + const uint32_t xlayer_map = + (uint32_t)avm_rb_read_literal(rb, MAX_NUM_XLAYERS - 1); + + // Extract xlayer IDs from bitmask + int num_xlayers = 0; + for (int i = 0; i < (int)(MAX_NUM_XLAYERS - 1); i++) { + if (xlayer_map & (1u << i)) { + if (num_xlayers >= AVM_MAX_NUM_STREAMS) break; + stream_ids[num_xlayers] = i; + num_xlayers++; + } + } + +#if PRINT_TU_INFO + printf("\n==Parse Global LCR xlayer_map==\n"); + printf("--xlayer_map: 0x%x\n", xlayer_map); + printf("--num_xlayers: %d\n", num_xlayers); + for (int i = 0; i < num_xlayers; i++) { + printf("--xlayer_id[%d]: %d\n", i, stream_ids[i]); + } +#endif // PRINT_TU_INFO + + return num_xlayers; +} + // This function read a multi-stream decoder operation OBU. static int read_multi_stream_decoder_operation(struct avm_read_bit_buffer *rb, int *stream_ids) { @@ -210,6 +249,20 @@ void ExtractTU(const uint8_t *data, int length, int *obu_overhead_bytes, data_ptr + obu_total_size + length_field_size); *num_streams = read_multi_stream_decoder_operation(&rb, stream_ids); per_stream_obus.resize(*num_streams); + } else if (obu_header.type == OBU_LAYER_CONFIGURATION_RECORD && + obu_header.obu_header_extension_flag && + obu_header.obu_xlayer_id == GLOBAL_XLAYER_ID && + *num_streams <= 1) { + // Parse Global LCR to discover xlayer IDs (only if MSDO hasn't already + // set up streams — MSDO takes priority when both are present). 
+ init_read_bit_buffer( + &rb, data_ptr + obu_header_size + static_cast(length_field_size), + data_ptr + obu_total_size + length_field_size); + int lcr_num_streams = read_global_lcr_xlayer_map(&rb, stream_ids); + if (lcr_num_streams > 1) { + *num_streams = lcr_num_streams; + per_stream_obus.resize(*num_streams); + } } else { // Determine which stream this OBU belongs to. int xlayer_id = 0;