diff --git a/VERSIONS b/VERSIONS index 861af6fca..6167d92c2 100644 --- a/VERSIONS +++ b/VERSIONS @@ -183,6 +183,13 @@ multitrait branch: whenever a mutationEffect() callback is added, removed, rescheduled, or activated/deactivated in a way that affects the current tick (invalidating all affected trait values) whenever the tick counter advances such that a previously active mutationEffect() callback becomes inactive, or vice versa (invalidating all affected trait values) whenever the tick counter is changed in script to a new arbitrary value (invalidating all trait values in all individuals in all species) + tree-sequence recording work for multitrait: + individual metadata now has a variable-length per-trait array of phenotype and offset information + mutation metadata is now moved from the mutation table's metadata column to a table placed in top-level metadata with the json+struct codec + mutation metadata now has a variable-length per-trait array of effect size and dominance information + top-level JSON metadata now has a new `traits` key (required), which lists the traits used in the tree sequence; duplicated in provenance + the top-level JSON, mutation, and individual metadata schemas are updated accordingly + pushed the .trees file version from 0.9 to 1.0 version 5.1 (Eidos version 4.1): diff --git a/core/slim_globals.cpp b/core/slim_globals.cpp index 45c8cf48f..8f4d5f0f2 100644 --- a/core/slim_globals.cpp +++ b/core/slim_globals.cpp @@ -123,12 +123,17 @@ void SLiM_WarmUp(void) //std::cout << "sizeof(size_t) == " << sizeof(size_t) << std::endl; // Test that our tskit metadata schemas are valid JSON, and print them out formatted for debugging purposes if desired - nlohmann::json top_level_schema, edge_schema, site_schema, mutation_schema, node_schema, individual_schema, population_schema; + nlohmann::json top_level_JSON_schema, top_level_binary_schema, edge_schema, site_schema, mutation_schema, node_schema, individual_schema, population_schema; try { - top_level_schema = nlohmann::json::parse(gSLiM_tsk_metadata_schema); + top_level_JSON_schema = nlohmann::json::parse(gSLiM_tsk_metadata_JSON_schema); } catch (...) { - EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_metadata_schema must be a JSON string." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_metadata_JSON_schema must be a JSON string." << EidosTerminate(); + } + try { + top_level_binary_schema = nlohmann::json::parse(gSLiM_tsk_metadata_binary_schema_FORMAT); + } catch (...) { + EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_metadata_binary_schema_FORMAT must be a JSON string." << EidosTerminate(); } try { if (gSLiM_tsk_edge_metadata_schema.length()) @@ -143,7 +148,8 @@ void SLiM_WarmUp(void) EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_site_metadata_schema must be a JSON string." << EidosTerminate(); } try { - mutation_schema = nlohmann::json::parse(gSLiM_tsk_mutation_metadata_schema); + if (gSLiM_tsk_mutation_metadata_schema.length()) + mutation_schema = nlohmann::json::parse(gSLiM_tsk_mutation_metadata_schema); } catch (...) { EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_mutation_metadata_schema must be a JSON string." << EidosTerminate(); } @@ -153,9 +159,9 @@ void SLiM_WarmUp(void) EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_node_metadata_schema_FORMAT must be a JSON string." << EidosTerminate(); } try { - individual_schema = nlohmann::json::parse(gSLiM_tsk_individual_metadata_schema); + individual_schema = nlohmann::json::parse(gSLiM_tsk_individual_metadata_schema_FORMAT); } catch (...) { - EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_individual_metadata_schema must be a JSON string." << EidosTerminate(); + EIDOS_TERMINATION << "ERROR (SLiM_WarmUp): (internal error) gSLiM_tsk_individual_metadata_schema_FORMAT must be a JSON string." << EidosTerminate(); } try { population_schema = nlohmann::json::parse(gSLiM_tsk_population_metadata_schema); @@ -165,12 +171,13 @@ void SLiM_WarmUp(void) #if 0 #warning printing of JSON schemas should be disabled in a production build - std::cout << "gSLiM_tsk_metadata_schema == " << std::endl << top_level_schema.dump(4) << std::endl << std::endl; + std::cout << "gSLiM_tsk_metadata_JSON_schema == " << std::endl << top_level_JSON_schema.dump(4) << std::endl << std::endl; + std::cout << "gSLiM_tsk_metadata_binary_schema_FORMAT == " << std::endl << top_level_binary_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_edge_metadata_schema == " << std::endl << edge_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_site_metadata_schema == " << std::endl << site_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_mutation_metadata_schema == " << std::endl << mutation_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_node_metadata_schema_FORMAT == " << std::endl << node_schema.dump(4) << std::endl << std::endl; - std::cout << "gSLiM_tsk_individual_metadata_schema == " << std::endl << individual_schema.dump(4) << std::endl << std::endl; + std::cout << "gSLiM_tsk_individual_metadata_schema_FORMAT == " << std::endl << individual_schema.dump(4) << std::endl << std::endl; std::cout << "gSLiM_tsk_population_metadata_schema == " << std::endl << population_schema.dump(4) << std::endl << std::endl; #endif } @@ -1775,59 +1782,62 @@ void SLiM_ConfigureContext(void) // See the pyslim code for readable versions of these. // For more info on schemas in tskit, see: https://tskit.dev/tskit/docs/stable/metadata.html#sec-metadata - -// BCH 11/7/2021: Since I have been needing to modify these here by hand, I have changed them into C++ raw string literals. -// I have also added some code in SLiM_WarmUp() that checks that the metadata schemas are all valid JSON, and optionally prints them. -// see https://stackoverflow.com/a/5460235/2752221 + +// Note that several schemas now require substitution of a string before they are used, because there are bits +// of the schemas that need to be customized depending on the context (number of chromosomes, number of traits). +// The spots where this substitutuon occurs are marked with things like "%d", "%d1", or "%d2", INCLUDING the +// quotes because the metadata strings given here need to be valid JSON to pass SLiM's internal checks. Those +// are then replaced, in some cases by a quoted string, in other cases by an integer. Schemas that contain +// substitution strings like this have a variable name here ending in "_FORMAT" to indicate that they cannot +// be used directly. + +// BCH 11/7/2021: Since I have been needing to modify these here by hand, I have changed them into C++ raw string +// literals. I have also added some code in SLiM_WarmUp() that checks that the metadata schemas are all valid +// JSON, and optionally prints them. See https://stackoverflow.com/a/5460235/2752221 // This is a useful JSON editor: https://jsoneditoronline.org/ // BCH 12/9/2024: Added the new optional key `chromosomes`, providing an array of information about chromosomes (to support // multiple chromosomes), and the new required key `this_chromosome` specifying information about this file's chromosome. -const std::string gSLiM_tsk_metadata_schema = -R"V0G0N({"$schema":"http://json-schema.org/schema#","codec":"json","examples":[{"SLiM":{"file_version":"0.9","name":"fox","description":"foxes on Catalina island","cycle":123,"tick":123,"model_type":"WF","this_chromosome":{"id":1,"index":0,"symbol":"1","name":"autosome_1","type":"A"},"chromosomes":[{"id":1,"symbol":"1","name":"autosome_1","type":"A"},{"id":35,"symbol":"MT","name":"mtDNA","type":"HF"}],"nucleotide_based":false,"separate_sexes":true,"spatial_dimensionality":"xy","spatial_periodicity":"x"}}],"properties":{"SLiM":{"description":"Top-level metadata for a SLiM tree sequence, file format version 0.9","properties":{"file_version":{"description":"The SLiM 'file format version' of this tree sequence.","type":"string"},"name":{"description":"The SLiM species name represented by this tree sequence.","type":"string"},"description":{"description":"A user-configurable description of the species represented by this tree sequence.","type":"string"},"cycle":{"description":"The 'SLiM cycle' counter when this tree sequence was recorded.","type":"integer"},"tick":{"description":"The 'SLiM tick' counter when this tree sequence was recorded.","type":"integer"},"model_type":{"description":"The model type used for the last part of this simulation (WF or nonWF).","enum":["WF","nonWF"],"type":"string"},"this_chromosome":{"description":"The chromosome represented by the tree sequence in this file.","properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"index":{"description":"The (zero-based) index of this chromosome in the chromosomes metadata array (if present), which should match the information given here.","type":"integer"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","index","symbol","type"],"type":"object"},"chromosomes":{"description":"The chromosomes represented by the collection of tree sequences, of which this tree sequence is one member.","items":{"properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","symbol","type"],"type":"object"},"type":"array"},"nucleotide_based":{"description":"Whether the simulation was nucleotide-based.","type":"boolean"},"separate_sexes":{"description":"Whether the simulation had separate sexes.","type":"boolean"},"spatial_dimensionality":{"description":"The spatial dimensionality of the simulation.","enum":["","x","xy","xyz"],"type":"string"},"spatial_periodicity":{"description":"The spatial periodicity of the simulation.","enum":["","x","y","z","xy","xz","yz","xyz"],"type":"string"},"stage":{"description":"The stage of the SLiM life cycle when this tree sequence was recorded.","type":"string"}},"required":["model_type","tick","file_version","spatial_dimensionality","spatial_periodicity","this_chromosome","separate_sexes","nucleotide_based"],"type":"object"}},"required":["SLiM"],"type":"object"})V0G0N"; +const std::string gSLiM_tsk_metadata_JSON_schema = +R"V0G0N({"codec":"json","examples":[{"SLiM":{"chromosomes":[{"id":1,"name":"autosome_1","symbol":"1","type":"A"},{"id":35,"name":"mtDNA","symbol":"MT","type":"HF"}],"cycle":123,"description":"foxes on Catalina island","file_version":"1.0","model_type":"WF","name":"fox","nucleotide_based":false,"separate_sexes":true,"spatial_dimensionality":"xy","spatial_periodicity":"x","this_chromosome":{"id":1,"index":0,"name":"autosome_1","symbol":"1","type":"A"},"tick":123,"traits":[{"index":0,"name":"simT","type":"multiplicative"}]}}],"properties":{"SLiM":{"description":"Top-level metadata for a SLiM tree sequence, file format version 1.0","properties":{"chromosomes":{"description":"The chromosomes represented by the collection of tree sequences, of which this tree sequence is one member.","items":{"properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","symbol","type"],"type":"object"},"type":"array"},"cycle":{"description":"The 'SLiM cycle' counter when this tree sequence was recorded.","type":"integer"},"description":{"description":"A user-configurable description of the species represented by this tree sequence.","type":"string"},"file_version":{"description":"The SLiM 'file format version' of this tree sequence.","type":"string"},"model_type":{"description":"The model type used for the last part of this simulation (WF or nonWF).","enum":["WF","nonWF"],"type":"string"},"name":{"description":"The SLiM species name represented by this tree sequence.","type":"string"},"nucleotide_based":{"description":"Whether the simulation was nucleotide-based.","type":"boolean"},"separate_sexes":{"description":"Whether the simulation had separate sexes.","type":"boolean"},"spatial_dimensionality":{"description":"The spatial dimensionality of the simulation.","enum":["","x","xy","xyz"],"type":"string"},"spatial_periodicity":{"description":"The spatial periodicity of the simulation.","enum":["","x","y","z","xy","xz","yz","xyz"],"type":"string"},"stage":{"description":"The stage of the SLiM life cycle when this tree sequence was recorded.","type":"string"},"this_chromosome":{"description":"The chromosome represented by the tree sequence in this file.","properties":{"id":{"description":"An integer identifier for the chromosome, unique within this set of tree sequences; often the chromosome number in the organism being represented, such as 1.","type":"integer"},"index":{"description":"The (zero-based) index of this chromosome in the chromosomes metadata array (if present), which should match the information given here.","type":"integer"},"name":{"description":"A user-specified name for the chromosome, such as an accession identifier.","type":"string"},"symbol":{"description":"A short string symbol for the chromosome, unique within this set of tree sequences, such as \"1\" or \"MT\".","type":"string"},"type":{"description":"The type of chromosome, as specified by SLiM.","type":"string"}},"required":["id","index","symbol","type"],"type":"object"},"tick":{"description":"The 'SLiM tick' counter when this tree sequence was recorded.","type":"integer"},"traits":{"description":"The traits defined for this tree sequence; each mutation and individual will have per-trait metadata.","items":{"properties":{"baselineAccumulation":{"description":"Whether the baseline offset includes accumulated effects from fixed (substituted) mutations.","type":"boolean"},"baselineOffset":{"description":"The baseline offset of the trait.","type":"number"},"directFitnessEffect":{"description":"Whether the trait's effects are used directly as fitness effects.","type":"boolean"},"index":{"description":"The integer index for the trait; indices must be sequential starting from zero.","type":"integer"},"individualOffsetMean":{"description":"The mean of the trait's individual offset distribution (which might or might not be used).","type":"number"},"individualOffsetSD":{"description":"The standard deviation of the trait's individual offset distribution (which might or might not be used).","type":"number"},"name":{"description":"The string name for the trait.","type":"string"},"type":{"description":"The type of the trait; this must be 'additive', 'multiplicative', or 'logistic'.","enum":["additive","multiplicative","logistic"],"type":"string"}},"required":["index","name","type"],"type":"object"},"type":"array"}},"required":["model_type","tick","file_version","spatial_dimensionality","spatial_periodicity","this_chromosome","separate_sexes","nucleotide_based","traits"],"type":"object"}},"required":["SLiM"],"type":"object"})V0G0N"; + +// Note the substituted strings "%d1" (the number of padding bytes used to produce 8-byte alignment of this +// binary metadata in the json_binary codec) and "%d2" (the number of traits in the focal species). +const std::string gSLiM_tsk_metadata_binary_schema_FORMAT = +R"V0G0N({"codec":"struct","description":"SLiM schema for binary top-level metadata.","properties":{"aligner":{"binaryFormat":"%d1","index":1,"type":"null"},"mutation_list":{"index":2,"items":{"additionalProperties":false,"properties":{"index":1,"mutation_id":{"binaryFormat":"q","description":"The SLiM mutation ID for this mutation.","type":"integer"},"mutation_type":{"binaryFormat":"i","description":"The id of this mutation's mutationType.","index":2,"type":"integer"},"nucleotide":{"binaryFormat":"b","description":"The nucleotide for this mutation (0=A , 1=C , 2=G, 3=T, or -1 for none)","index":5,"type":"integer"},"per_trait":{"index":6,"items":{"additionalProperties":false,"properties":{"dominance":{"binaryFormat":"f","description":"The dominance coefficient for this trait.","index":2,"type":"number"},"effect_size":{"binaryFormat":"f","description":"The effect size for this trait.","index":1,"type":"number"},"hemizygous_dominance":{"binaryFormat":"f","description":"The hemizygous dominance coefficient for this trait.","index":3,"type":"number"}}},"length":"%d2","type":"array"},"slim_time":{"binaryFormat":"i","description":"The SLiM tick counter when this mutation occurred.","index":4,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation this mutation occurred in.","index":3,"type":"integer"}},"required":["mutation_id","mutation_type","slim_time","subpopulation","nucleotide","per_trait"],"type":"object"},"noLengthEncodingExhaustBuffer":true,"type":"array"}},"required":["aligner","mutation_list"],"type":"object"})V0G0N"; const std::string gSLiM_tsk_edge_metadata_schema = ""; const std::string gSLiM_tsk_site_metadata_schema = ""; - -const std::string gSLiM_tsk_mutation_metadata_schema = -R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for mutation metadata.","examples":[{"mutation_list":[{"mutation_type":1,"nucleotide":3,"selection_coeff":-0.2,"slim_time":243,"subpopulation":0}]}],"properties":{"mutation_list":{"items":{"additionalProperties":false,"properties":{"mutation_type":{"binaryFormat":"i","description":"The index of this mutation's mutationType.","index":1,"type":"integer"},"nucleotide":{"binaryFormat":"b","description":"The nucleotide for this mutation (0=A , 1=C , 2=G, 3=T, or -1 for none)","index":5,"type":"integer"},"selection_coeff":{"binaryFormat":"f","description":"This mutation's selection coefficient.","index":2,"type":"number"},"slim_time":{"binaryFormat":"i","description":"The SLiM tick counter when this mutation occurred.","index":4,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation this mutation occurred in.","index":3,"type":"integer"}},"required":["mutation_type","selection_coeff","subpopulation","slim_time","nucleotide"],"type":"object"},"noLengthEncodingExhaustBuffer":true,"type":"array"}},"required":["mutation_list"],"type":"object"})V0G0N"; - -// BCH 12/10/2024: Removed the type field, and changed the treatment of is_vacant. We have a -// tricky problem here, which is that is_vacant is now variable-length and there is no count. -// The number of byte (uint8_t) entries in is_vacant depends on the number of chromosomes in -// the full set of tree sequences, because the node metadata has to contain flags (bits) for -// every chromosome, not just for the chromosome represented by this file. So we deduce the -// length of is_vacant from that, but it is variable-length and has no count associated with -// it in the metadata. I think this is actually not allowed in JSON Schema, understandably. -// To make this work, we have to write out a DIFFERENT VERSION OF THIS METADATA SCHEMA -// depending on the number of bytes used. In other words, if 7 bytes of is_vacant data are -// needed (for 49-56 chromosomes), we'd write out a version of the schema that specifies -// 7 bytes of is_vacant data using binaryFormat:7B. This effectively puts the count into the -// schema itself. The number of bytes present can thus be inferred from the schema present -// in the file, but also from the 'chromosomes' top-level metadata key; one bit is taken -// for each chromosome, in order, regardless of their type, providing flags for one node -// table entry for one haplosome of each chromosome. (Remember, there are two node table -// entries per individual; the first corresponds to haplosome 1, so its is_vacant data only -// records is_vacant flags for haplosome 1 of each chromosome, and similarly for the second -// node table entry corresponding to haplosome 2 of each chromosome.) The variable name here -// ends in "_FORMAT" because it is a format string containing `%d`, which must be replaced -// by the correct byte count when it is used for output. See SetCurrentNewIndividual() and -// RecordNewHaplosome() for how this dynamic metadata structure is used in practice, and -// WriteTreeSequenceMetadata() for where this schema format string is used. +const std::string gSLiM_tsk_mutation_metadata_schema = ""; // this is now managed by gSLiM_tsk_metadata_binary_schema in top-level metadata + +// BCH 12/10/2024: Removed the type field, and changed the treatment of is_vacant. We have a tricky problem +// here, which is that is_vacant is now variable-length and there is no count. The number of byte (uint8_t) +// entries in is_vacant depends on the number of chromosomes in the full set of tree sequences, because the node +// metadata has to contain flags (bits) for every chromosome, not just for the chromosome represented by this +// file. So we deduce the length of is_vacant from that, but it is variable-length and has no count associated +// with it in the metadata. This is not allowed in JSON Schema, understandably. To make this work, we have to +// write out a DIFFERENT VERSION OF THIS METADATA SCHEMA depending on the number of bytes used. In other words, +// if 7 bytes of is_vacant data are needed (for 49-56 chromosomes), we'd write out a version of the schema that\ +// specifies 7 bytes of is_vacant data using binaryFormat:7B. This effectively puts the count into the schema +// itself. The number of bytes present can thus be inferred from the schema present in the file, but also from +// the 'chromosomes' top-level metadata key; one bit is taken for each chromosome, in order, regardless of their +// type, providing flags for one node table entry for one haplosome of each chromosome. (Remember, there are two +// node table entries per individual; the first corresponds to haplosome 1, so its is_vacant data only records +// is_vacant flags for haplosome 1 of each chromosome, and similarly for the second node table entry corresponding +// to haplosome 2 of each chromosome.) See SetCurrentNewIndividual() and RecordNewHaplosome() for how this +// dynamic metadata structure is used in practice, and WriteTreeSequenceMetadata() for where this schema is used. // -// BCH 4/10/2025: Changing from binary format 's' to 'B', using the new support for fixed- -// length arrays in tskit: https://github.com/tskit-dev/tskit/issues/3088. This has been -// released in tskit version Python 0.6.1. The new syntax for declaring a fixed-length -// array is: {"type": "array", "length": 3, "items": {"type":"number", "binaryFormat":"B"}}. -// The length, 3 here, is encoded with "%d" and replaced at runtime with the correct count. -// (The string replaced is "%d" *including* the quotes, because the format string needs to -// itself be a legal JSON string in order to pass SLiM's own internal checks, so beware.) +// BCH 4/10/2025: Changing from binary format 's' to 'B', using the new support for fixed-length arrays in tskit: +// https://github.com/tskit-dev/tskit/issues/3088 (released in tskit version Python 0.6.1). The new syntax for +// a fixed-length array is: {"type": "array", "length": 3, "items": {"type":"number", "binaryFormat":"B"}}. The +// length, 3 here, is encoded with "%d" and replaced at runtime with the correct count. const std::string gSLiM_tsk_node_metadata_schema_FORMAT = R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for node metadata.","examples":[{"is_vacant":0,"slim_id":123}],"properties":{"is_vacant":{"description":"A vector of byte (uint8_t) values, with each bit representing whether the node represents a vacant position, either unused or a null haplosome (1), or a non-null haplosome (0), in the corresponding chromosome. This field encodes vacancy for all of the chromosomes in the model, not just the chromosome represented in this file (so that the node table is identical across all chromosomes for a multi-chromosome model). Each chromosome receives one bit here; there are two node table entries per individual, used for the two haplosomes of every chromosome, so only one bit is needed in each entry (making two bits total per chromosome, across the two node table entries). The least significant bit of the first byte is used first (for one haplosome of the first chromosome); the most significant bit of the last byte is used last. The number of bytes present in this field is indicated by this schema's 'binaryFormat' field, which is variable (!), and can also be deduced from the number of chromosomes in the model as given in the top-level 'chromosomes' metadata key, which should always be present if this metadata is present.","index":1,"items":{"binaryFormat":"B","type":"number"},"length":"%d","type":"array"},"slim_id":{"binaryFormat":"q","description":"The 'pedigree ID' of the haplosomes associated with this node in SLiM.","index":0,"type":"integer"}},"required":["slim_id","is_vacant"],"type":["object","null"]})V0G0N"; -const std::string gSLiM_tsk_individual_metadata_schema = -R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for individual metadata.","examples":[{"age":-1,"flags":0,"pedigree_id":123,"pedigree_p1":12,"pedigree_p2":23,"sex":0,"subpopulation":0}],"flags":{"SLIM_INDIVIDUAL_METADATA_MIGRATED":{"description":"Whether this individual was a migrant, either in the tick when the tree sequence was written out (if the individual was alive then), or in the tick of the last time they were Remembered (if not).","value":1}},"properties":{"age":{"binaryFormat":"i","description":"The age of this individual, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":4,"type":"integer"},"flags":{"binaryFormat":"I","description":"Other information about the individual: see 'flags'.","index":7,"type":"integer"},"pedigree_id":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual in SLiM.","index":1,"type":"integer"},"pedigree_p1":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's first parent in SLiM.","index":2,"type":"integer"},"pedigree_p2":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's second parent in SLiM.","index":3,"type":"integer"},"sex":{"binaryFormat":"i","description":"The sex of the individual (0 for female, 1 for male, -1 for hermaphrodite).","index":6,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation the individual was part of, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":5,"type":"integer"}},"required":["pedigree_id","pedigree_p1","pedigree_p2","age","subpopulation","sex","flags"],"type":"object"})V0G0N"; +// Note the substitute string "%d" (the number of traits in the focal species). +const std::string gSLiM_tsk_individual_metadata_schema_FORMAT = +R"V0G0N({"$schema":"http://json-schema.org/schema#","additionalProperties":false,"codec":"struct","description":"SLiM schema for individual metadata.","examples":[{"age":-1,"flags":0,"pedigree_id":123,"pedigree_p1":12,"pedigree_p2":23,"sex":0,"subpopulation":0,"per_trait":[{"offset":1.0,"phenotype":1.1}]}],"flags":{"SLIM_INDIVIDUAL_METADATA_MIGRATED":{"description":"Whether this individual was a migrant, either in the tick when the tree sequence was written out (if the individual was alive then), or in the tick of the last time they were Remembered (if not).","value":1}},"properties":{"age":{"binaryFormat":"i","description":"The age of this individual, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":4,"type":"integer"},"flags":{"binaryFormat":"I","description":"Other information about the individual: see 'flags'.","index":7,"type":"integer"},"pedigree_id":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual in SLiM.","index":1,"type":"integer"},"pedigree_p1":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's first parent in SLiM.","index":2,"type":"integer"},"pedigree_p2":{"binaryFormat":"q","description":"The 'pedigree ID' of this individual's second parent in SLiM.","index":3,"type":"integer"},"per_trait":{"index":8,"items":{"additionalProperties":false,"properties":{"offset":{"binaryFormat":"d","description":"The individual offset for this trait.","index":2,"type":"number"},"phenotype":{"binaryFormat":"d","description":"The phenotype for this trait.","index":1,"type":"number"}}},"length":"%d","type":"array"},"sex":{"binaryFormat":"i","description":"The sex of the individual (0 for female, 1 for male, -1 for hermaphrodite).","index":6,"type":"integer"},"subpopulation":{"binaryFormat":"i","description":"The ID of the subpopulation the individual was part of, either when the tree sequence was written out (if the individual was alive then), or the last time they were Remembered (if not).","index":5,"type":"integer"}},"required":["pedigree_id","pedigree_p1","pedigree_p2","age","subpopulation","sex","flags"],"type":"object"})V0G0N"; // This schema was obsoleted in SLiM 3.7; we now use a JSON schema for the population metadata (see below) const std::string gSLiM_tsk_population_metadata_schema_PREJSON = diff --git a/core/slim_globals.h b/core/slim_globals.h index ff875fc64..694cff71d 100644 --- a/core/slim_globals.h +++ b/core/slim_globals.h @@ -683,12 +683,14 @@ enum class IndDomCacheIndex : slim_trait_index_t {}; #define SLIM_TSK_INDIVIDUAL_REMEMBERED ((tsk_flags_t)(1 << 17)) #define SLIM_TSK_INDIVIDUAL_RETAINED ((tsk_flags_t)(1 << 18)) -extern const std::string gSLiM_tsk_metadata_schema; +extern const std::string gSLiM_tsk_metadata_JSON_schema; +extern const std::string gSLiM_tsk_metadata_binary_schema_FORMAT; + extern const std::string gSLiM_tsk_edge_metadata_schema; extern const std::string gSLiM_tsk_site_metadata_schema; extern const std::string gSLiM_tsk_mutation_metadata_schema; extern const std::string gSLiM_tsk_node_metadata_schema_FORMAT; -extern const std::string gSLiM_tsk_individual_metadata_schema; +extern const std::string gSLiM_tsk_individual_metadata_schema_FORMAT; extern const std::string gSLiM_tsk_population_metadata_schema_PREJSON; // before SLiM 3.7 extern const std::string gSLiM_tsk_population_metadata_schema; diff --git a/core/species.cpp b/core/species.cpp index 0598823e9..bf664af04 100644 --- a/core/species.cpp +++ b/core/species.cpp @@ -89,7 +89,8 @@ static const char *SLIM_TREES_FILE_VERSION_META = "0.5"; // SLiM 3.5.x onward, static const char *SLIM_TREES_FILE_VERSION_PREPARENT = "0.6"; // SLiM 3.6.x onward, with SLIM_TSK_INDIVIDUAL_RETAINED instead of SLIM_TSK_INDIVIDUAL_FIRST_GEN static const char *SLIM_TREES_FILE_VERSION_PRESPECIES = "0.7"; // SLiM 3.7.x onward, with parent pedigree IDs in the individuals table metadata static const char *SLIM_TREES_FILE_VERSION_SPECIES = "0.8"; // SLiM 4.0.x onward, with species `name`/`description`, and `tick` in addition to `cycle` -static const char *SLIM_TREES_FILE_VERSION = "0.9"; // SLiM 5.0 onward, for multichrom (haplosomes not genomes, and `chromosomes` key) +static const char *SLIM_TREES_FILE_VERSION_MULTICHROM = "0.9"; // SLiM 5.0 onward, for multichrom (haplosomes not genomes, and `chromosomes` key) +static const char *SLIM_TREES_FILE_VERSION = "1.0"; // SLiM 5.2 onward, for multitrait (per-trait metadata, `traits` key) #pragma mark - #pragma mark Species @@ -6644,6 +6645,8 @@ void Species::AddParentsColumnForOutput(tsk_table_collection_t *p_tables, INDIVI for (tsk_size_t individual_index = 0; individual_index < num_rows; individual_index++) { tsk_id_t tsk_individual = (tsk_id_t)individual_index; + + // note that IndividualMetadataRec is variable-length now, but we only use the fixed-size header portion here IndividualMetadataRec *metadata_rec = (IndividualMetadataRec *)(p_tables->individuals.metadata + p_tables->individuals.metadata_offset[tsk_individual]); slim_pedigreeid_t pedigree_p1 = metadata_rec->pedigree_p1_; slim_pedigreeid_t pedigree_p2 = metadata_rec->pedigree_p2_; @@ -6700,6 +6703,7 @@ void Species::BuildTabledIndividualsHash(tsk_table_collection_t *p_tables, INDIV for (tsk_size_t individual_index = 0; individual_index < num_rows; individual_index++) { + // note that IndividualMetadataRec is variable-length now, but we only use the fixed-size header portion here IndividualMetadataRec *metadata_rec = (IndividualMetadataRec *)(metadata_base + metadata_offset[individual_index]); slim_pedigreeid_t pedigree_id = metadata_rec->pedigree_id_; tsk_id_t tsk_individual = (tsk_id_t)individual_index; @@ -7919,17 +7923,10 @@ void Species::RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_ THREAD_SAFETY_IN_ACTIVE_PARALLEL("Species::RecordNewDerivedState(): usage of statics"); static std::vector derived_mutation_ids; - static std::vector mutation_metadata; - MutationMetadataRec metadata_rec; derived_mutation_ids.resize(0); - mutation_metadata.resize(0); for (Mutation *mutation : p_derived_mutations) - { derived_mutation_ids.emplace_back(mutation->mutation_id_); - MetadataForMutation(mutation, &metadata_rec); - mutation_metadata.emplace_back(metadata_rec); - } // find and incorporate any fixed mutations at this position, which exist in all new derived states but are not included by SLiM // BCH 5/14/2019: Note that this means that derived states will be recorded that look "stacked" even when those mutations would @@ -7945,8 +7942,6 @@ void Species::RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_ Substitution *substitution = position_iter->second; derived_mutation_ids.emplace_back(substitution->mutation_id_); - MetadataForSubstitution(substitution, &metadata_rec); - mutation_metadata.emplace_back(metadata_rec); } // check for time consistency, using the shared node table in treeseq_[0]; this used to be a DEBUG check, but @@ -7962,13 +7957,11 @@ void Species::RecordNewDerivedState(const Haplosome *p_haplosome, slim_position_ // add the mutation table row with the final derived state and metadata char *derived_muts_bytes = (char *)(derived_mutation_ids.data()); size_t derived_state_length = derived_mutation_ids.size() * sizeof(slim_mutationid_t); - char *mutation_metadata_bytes = (char *)(mutation_metadata.data()); - size_t mutation_metadata_length = mutation_metadata.size() * sizeof(MutationMetadataRec); int ret = tsk_mutation_table_add_row(&tsinfo.tables_.mutations, site_id, haplosomeTSKID, TSK_NULL, time, derived_muts_bytes, (tsk_size_t)derived_state_length, - mutation_metadata_bytes, (tsk_size_t)mutation_metadata_length); + NULL, (tsk_size_t)0); if (ret < 0) handle_error("tsk_mutation_table_add_row", ret); } @@ -8204,8 +8197,13 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n location.emplace_back(ind->spatial_y_); location.emplace_back(ind->spatial_z_); - IndividualMetadataRec metadata_rec; - MetadataForIndividual(ind, &metadata_rec); + // the IndividualMetadataRec struct is now variable-size; we want to make a struct of the appropriate + // size for the amount of per-trait metadata that will be present for individuals of this species + slim_trait_index_t trait_count = TraitCount(); + size_t total_metadata_size = sizeof(IndividualMetadataRec) + sizeof(_IndividualPerTraitMetadata) * (trait_count - 1); + uint8_t metadata_buffer[total_metadata_size]; // assumes compiler extension for variable-size stack allocation + IndividualMetadataRec &metadata_rec = *(IndividualMetadataRec *)metadata_buffer; + MetadataForIndividual(ind, &metadata_rec); // requires a metadata record of the appropriate size // do a fast lookup to see whether this individual is already in the individuals table auto ind_pos = p_individuals_hash->find(ped_id); @@ -8215,7 +8213,7 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n tsk_id_t tsk_individual = tsk_individual_table_add_row(&p_tables->individuals, p_flags, location.data(), (uint32_t)location.size(), NULL, 0, // individual parents - (char *)&metadata_rec, (uint32_t)sizeof(IndividualMetadataRec)); + (char *)&metadata_rec, (uint32_t)total_metadata_size); if (tsk_individual < 0) handle_error("tsk_individual_table_add_row", tsk_individual); // Add the new individual to our hash table, for fast lookup as done above @@ -8242,7 +8240,7 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n && (location.size() == (p_tables->individuals.location_offset[tsk_individual + 1] - p_tables->individuals.location_offset[tsk_individual])) - && (sizeof(IndividualMetadataRec) + && (total_metadata_size == (p_tables->individuals.metadata_offset[tsk_individual + 1] - p_tables->individuals.metadata_offset[tsk_individual]))); @@ -8262,7 +8260,7 @@ void Species::AddIndividualsToTable(Individual * const *p_individual, size_t p_n location.data(), location.size() * sizeof(double)); memcpy(p_tables->individuals.metadata + p_tables->individuals.metadata_offset[tsk_individual], - &metadata_rec, sizeof(IndividualMetadataRec)); + &metadata_rec, total_metadata_size); p_tables->individuals.flags[tsk_individual] |= p_flags; // Check node table @@ -8558,13 +8556,18 @@ void Species::WriteTreeSequenceMetadata(tsk_table_collection_t *p_tables, EidosD ////// // Top-level (tree sequence) metadata: - // In the future, we might need to *add* to the metadata *and also* the schema, - // leaving other keys that might already be there. - // But that's being a headache, so we're skipping it. + // + // SLiM now writes top-level metadata with the new "json+struct" codec. So we assemble a JSON string and + // a binary struct, and put them together with a magic byte string, a version number, a header, and schemas. + // See https://github.com/tskit-dev/tskit/pull/3306 for the PR for the json+struct codec and discussion. + // + // NOTE: In the future, we might need to *add* to the metadata *and also* the schema, leaving other keys + // that might already be there. But that's being a headache, so we're skipping it. + // First generate our JSON string, as we did in SLiM 5.1 and earlier: // BCH 3/9/2025: This is now wrapped in try/catch because the JSON library might raise, especially if it dislikes // the model string we try to put in metadata for p_include_model; see https://github.com/MesserLab/SLiM/issues/488 - std::string new_metadata_str; + std::string metadata_JSON_str; try { nlohmann::json metadata; @@ -8666,59 +8669,253 @@ void Species::WriteTreeSequenceMetadata(tsk_table_collection_t *p_tables, EidosD } } - new_metadata_str = metadata.dump(); + metadata["SLiM"]["traits"] = nlohmann::json::array(); + for (Trait *trait : traits_) + { + nlohmann::json trait_info; + + trait_info["index"] = trait->Index(); + trait_info["name"] = trait->Name(); + + if (trait->HasLogisticPostTransform()) + trait_info["type"] = "logistic"; + else if (trait->Type() == TraitType::kAdditive) + trait_info["type"] = "additive"; + else + trait_info["type"] = "multiplicative"; + + trait_info["baselineOffset"] = trait->BaselineOffset(); + trait_info["baselineAccumulation"] = trait->HasBaselineAccumulation(); + + trait_info["individualOffsetMean"] = trait->IndividualOffsetDistributionMean(); + trait_info["individualOffsetSD"] = trait->IndividualOffsetDistributionSD(); + + trait_info["directFitnessEffect"] = trait->HasDirectFitnessEffect(); + + metadata["SLiM"]["traits"].push_back(trait_info); + } + + metadata_JSON_str = metadata.dump(); } catch (const std::exception &e) { EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): a JSON string could not be generated for tree-sequence metadata due to an error: '" << e.what() << "'." << EidosTerminate(); } catch (...) { EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): a JSON string could not be generated for tree-sequence metadata due to an unknown error." << EidosTerminate(); } - ret = tsk_table_collection_set_metadata( - p_tables, new_metadata_str.c_str(), (tsk_size_t)new_metadata_str.length()); + // Then figure out how large our binary data will be (but don't generate the data yet; we will write it + // directly into the buffer that we allocate below, to avoid copying and increased memory overhead) +#warning need to include additional sources of mutations here + int registry_size; + const MutationIndex *register_iter = population_.MutationRegistry(®istry_size); + + const std::vector substitutions = population_.substitutions_; + size_t substitution_count = substitutions.size(); + + size_t row_count = registry_size + substitution_count; + slim_trait_index_t trait_count = TraitCount(); + size_t size_for_additional_trait_info = sizeof(_MutationPerTraitMetadata) * (trait_count - 1); + size_t size_for_one_muttable_row = sizeof(MutationTableMetadataRec) + size_for_additional_trait_info; + size_t mutation_table_size = row_count * size_for_one_muttable_row; + + // Then assemble those two components into a json+struct metadata chunk: + const uint8_t TSK_JSON_BINARY_MAGIC[4] = { 'J', 'B', 'L', 'B' }; + size_t header_length = 4 + 1 + 8 + 8; + size_t json_length = metadata_JSON_str.length(); + size_t binary_length = mutation_table_size; + + // Insert null bytes as needed to align the binary chunk to an 8-byte boundary + size_t aligner_length = 8 - ((header_length + json_length) & 0x07); + + size_t total_length = header_length + json_length + aligner_length + binary_length; + uint8_t *metadata_buffer = (uint8_t *)malloc(total_length); + + if (!metadata_buffer) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): allocation failed; you may need to raise the memory limit for SLiM." << EidosTerminate(); + + metadata_buffer[0] = TSK_JSON_BINARY_MAGIC[0]; + metadata_buffer[1] = TSK_JSON_BINARY_MAGIC[1]; + metadata_buffer[2] = TSK_JSON_BINARY_MAGIC[2]; + metadata_buffer[3] = TSK_JSON_BINARY_MAGIC[3]; + + metadata_buffer[4] = 1; // version number for the json_blob codec + + Eidos_set_u64_le(metadata_buffer + 5, (uint64_t)json_length); + Eidos_set_u64_le(metadata_buffer + 13, (uint64_t)binary_length); + + memcpy(metadata_buffer + header_length, metadata_JSON_str.data(), json_length); + + for (size_t aligner_index = 0; aligner_index < aligner_length; ++aligner_index) + metadata_buffer[header_length + json_length + aligner_index] = 0; + + // Write mutation metadata into the binary section + uint8_t *row_pointer = metadata_buffer + header_length + json_length + aligner_length; + + for (int registry_index = 0; registry_index < registry_size; ++registry_index) + { + MutationTableMetadataRec *metadata_row = (MutationTableMetadataRec *)row_pointer; + MutationIndex mut_index = register_iter[registry_index]; + Mutation *mut = mutation_block_->MutationForIndex(mut_index); + + metadata_row->mutation_id_ = mut->mutation_id_; + metadata_row->mutation_type_id_ = mut->mutation_type_ptr_->mutation_type_id_; + metadata_row->origin_tick_ = mut->origin_tick_; + metadata_row->subpop_index_ = mut->subpop_index_; + metadata_row->nucleotide_ = mut->nucleotide_; + + metadata_row->unused_[0] = 0; + metadata_row->unused_[1] = 0; + metadata_row->unused_[2] = 0; + + MutationTraitInfo *mut_trait_info_array = mutation_block_->TraitInfoForIndex(mut_index); + _MutationPerTraitMetadata *metadata_trait_info_array = metadata_row->per_trait_; + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; ++trait_index) + { + MutationTraitInfo *mut_trait_info = mut_trait_info_array + trait_index; + _MutationPerTraitMetadata *metadata_trait_info = metadata_trait_info_array + trait_index; + + metadata_trait_info->effect_size_ = mut_trait_info->effect_size_; + metadata_trait_info->dominance_coeff_ = mut_trait_info->dominance_coeff_UNSAFE_; + metadata_trait_info->hemizygous_dominance_coeff_ = mut_trait_info->hemizygous_dominance_coeff_; + } + + row_pointer += size_for_one_muttable_row; + } + + for (Substitution *sub : substitutions) + { + MutationTableMetadataRec *metadata_row = (MutationTableMetadataRec *)row_pointer; + + metadata_row->mutation_id_ = sub->mutation_id_; + metadata_row->mutation_type_id_ = sub->mutation_type_ptr_->mutation_type_id_; + metadata_row->origin_tick_ = sub->origin_tick_; + metadata_row->subpop_index_ = sub->subpop_index_; + metadata_row->nucleotide_ = sub->nucleotide_; + + metadata_row->unused_[0] = 0; + metadata_row->unused_[1] = 0; + metadata_row->unused_[2] = 0; + + SubstitutionTraitInfo *sub_trait_info_array = sub->trait_info_; + _MutationPerTraitMetadata *metadata_trait_info_array = metadata_row->per_trait_; + + for (slim_trait_index_t trait_index = 0; trait_index < trait_count; ++trait_index) + { + SubstitutionTraitInfo *sub_trait_info = sub_trait_info_array + trait_index; + _MutationPerTraitMetadata *metadata_trait_info = metadata_trait_info_array + trait_index; + + metadata_trait_info->effect_size_ = sub_trait_info->effect_size_; + metadata_trait_info->dominance_coeff_ = sub_trait_info->dominance_coeff_UNSAFE_; + metadata_trait_info->hemizygous_dominance_coeff_ = sub_trait_info->hemizygous_dominance_coeff_; + } + + row_pointer += size_for_one_muttable_row; + } + + // Write out the metadata chunk to the table collection: + ret = tsk_table_collection_set_metadata(p_tables, (const char *)metadata_buffer, (tsk_size_t)total_length); if (ret != 0) handle_error("tsk_table_collection_set_metadata", ret); - - // As above, we maybe ought to edit the metadata schema adding our keys, + + // And finally, generate the bipartite schema for the json+struct metadata chunk: + // + // NOTE: As above, we maybe ought to edit the metadata schema adding our keys, // but then comparing tables is a headache; see tskit#763 + std::string jps_metadata_schema; + + jps_metadata_schema = R"V0G0N({"$schema":"http://json-schema.org/schema#","codec":"json+struct","json":)V0G0N"; + jps_metadata_schema += gSLiM_tsk_metadata_JSON_schema; + jps_metadata_schema += R"V0G0N(,"struct":)V0G0N"; + jps_metadata_schema += gSLiM_tsk_metadata_binary_schema_FORMAT; + jps_metadata_schema += R"V0G0N(})V0G0N"; + + { + // fix "%d1" to have the count of padding bytes needed for 8-byte alignment + size_t d1_pos = jps_metadata_schema.find("\"%d1\""); + + if (d1_pos == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) substring \"%d1\" for replacement in schema not found." << EidosTerminate(); + else + jps_metadata_schema.replace(d1_pos, 5, "\"" + std::to_string(aligner_length) + "x\""); // e.g., "%d1" -> "5x" + } + { + // fix "%d2" to have the number of traits in this species + size_t d2_pos = jps_metadata_schema.find("\"%d2\""); + + if (d2_pos == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) substring \"%d2\" for replacement in schema not found." << EidosTerminate(); + else + jps_metadata_schema.replace(d2_pos, 5, std::to_string(trait_count)); // e.g., "%d2" -> 3 + } + +#if 0 + // DEBUG: dump the json+struct metadata schema to std::cout + std::cout << std::endl << "jps_metadata_schema == " << std::endl << jps_metadata_schema << std::endl << std::endl; + + nlohmann::json jps_metadata_schema_JSON; + + try { + jps_metadata_schema_JSON = nlohmann::json::parse(jps_metadata_schema); + } catch (...) { + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) jps_metadata_schema must be a JSON string." << EidosTerminate(); + } + + std::cout << "jps_metadata_schema == " << std::endl << jps_metadata_schema_JSON.dump(4) << std::endl << std::endl; +#endif + ret = tsk_table_collection_set_metadata_schema( - p_tables, gSLiM_tsk_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_metadata_schema.length()); + p_tables, jps_metadata_schema.data(), (tsk_size_t)jps_metadata_schema.length()); if (ret != 0) handle_error("tsk_table_collection_set_metadata_schema", ret); - + //////////// // Set metadata schema on each table + ret = tsk_edge_table_set_metadata_schema(&p_tables->edges, gSLiM_tsk_edge_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_edge_metadata_schema.length()); if (ret != 0) handle_error("tsk_edge_table_set_metadata_schema", ret); + ret = tsk_site_table_set_metadata_schema(&p_tables->sites, gSLiM_tsk_site_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_site_metadata_schema.length()); if (ret != 0) handle_error("tsk_site_table_set_metadata_schema", ret); + ret = tsk_mutation_table_set_metadata_schema(&p_tables->mutations, gSLiM_tsk_mutation_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_mutation_metadata_schema.length()); if (ret != 0) handle_error("tsk_mutation_table_set_metadata_schema", ret); - ret = tsk_individual_table_set_metadata_schema(&p_tables->individuals, - gSLiM_tsk_individual_metadata_schema.c_str(), - (tsk_size_t)gSLiM_tsk_individual_metadata_schema.length()); - if (ret != 0) - handle_error("tsk_individual_table_set_metadata_schema", ret); + + { + // fix "%d" in the individual metadata to have the number of traits in the species + std::string tsk_individual_metadata_schema = gSLiM_tsk_individual_metadata_schema_FORMAT; + size_t d_pos = tsk_individual_metadata_schema.find("\"%d\""); + + if (d_pos == std::string::npos) + EIDOS_TERMINATION << "ERROR (Species::WriteTreeSequenceMetadata): (internal error) \"%d\" substring missing from gSLiM_tsk_individual_metadata_schema_FORMAT." << EidosTerminate(); + else + jps_metadata_schema.replace(d_pos, 4, std::to_string(TraitCount())); // e.g., "%d" -> 4 + + ret = tsk_individual_table_set_metadata_schema(&p_tables->individuals, + tsk_individual_metadata_schema.c_str(), + (tsk_size_t)tsk_individual_metadata_schema.length()); + if (ret != 0) + handle_error("tsk_individual_table_set_metadata_schema", ret); + } + ret = tsk_population_table_set_metadata_schema(&p_tables->populations, gSLiM_tsk_population_metadata_schema.c_str(), (tsk_size_t)gSLiM_tsk_population_metadata_schema.length()); if (ret != 0) handle_error("tsk_population_table_set_metadata_schema", ret); - // For the node table the schema we save out depends upon the number of - // bits needed to represent the null haplosome structure of the model. - // We allocate one bit per chromosome, in each node table entry (note - // there are two entries per individual, so it ends up being two bits - // of information per chromosome, across the two node table entries.) - // See the big comment on gSLiM_tsk_node_metadata_schema_FORMAT. + // For the node table the schema we save out depends upon the number of bits needed to represent the null + // haplosome structure of the model. We allocate one bit per chromosome, in each node table entry (note + // there are two entries per individual, so it ends up being two bits of information per chromosome, across + // the two node table entries). See the big comment on gSLiM_tsk_node_metadata_schema_FORMAT. std::string tsk_node_metadata_schema = gSLiM_tsk_node_metadata_schema_FORMAT; size_t pos = tsk_node_metadata_schema.find("\"%d\""); std::string count_string = std::to_string(haplosome_metadata_is_vacant_bytes_); @@ -8873,6 +9070,32 @@ void Species::WriteProvenanceTable(tsk_table_collection_t *p_tables, bool p_use_ } } + j["parameters"]["traits"] = nlohmann::json::array(); + for (Trait *trait : traits_) + { + nlohmann::json trait_info; + + trait_info["index"] = trait->Index(); + trait_info["name"] = trait->Name(); + + if (trait->HasLogisticPostTransform()) + trait_info["type"] = "logistic"; + else if (trait->Type() == TraitType::kAdditive) + trait_info["type"] = "additive"; + else + trait_info["type"] = "multiplicative"; + + trait_info["baselineOffset"] = trait->BaselineOffset(); + trait_info["baselineAccumulation"] = trait->HasBaselineAccumulation(); + + trait_info["individualOffsetMean"] = trait->IndividualOffsetDistributionMean(); + trait_info["individualOffsetSD"] = trait->IndividualOffsetDistributionSD(); + + trait_info["directFitnessEffect"] = trait->HasDirectFitnessEffect(); + + j["parameters"]["traits"].push_back(trait_info); + } + if (p_include_model) j["parameters"]["model"] = scriptString; // made model optional in file_version 0.4 j["parameters"]["model_hash"] = scriptHashString; // added model_hash in file_version 0.4 @@ -9024,14 +9247,16 @@ void Species::ReadTreeSequenceMetadata(TreeSeqInfo &p_treeseq, slim_tick_t *p_ti (file_version == SLIM_TREES_FILE_VERSION_META) || (file_version == SLIM_TREES_FILE_VERSION_PREPARENT) || (file_version == SLIM_TREES_FILE_VERSION_PRESPECIES) || - (file_version == SLIM_TREES_FILE_VERSION_SPECIES)) + (file_version == SLIM_TREES_FILE_VERSION_SPECIES) || + (file_version == SLIM_TREES_FILE_VERSION_MULTICHROM)) { - // SLiM 5.0 breaks backward compatibility with earlier file versions + // SLiM 5.2 breaks backward compatibility with earlier file versions EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): the version of this file appears to be too old to be read, or the file is corrupted; you can try using pyslim to bring an old file version forward to the current version, or generate a new file with the current version of SLiM or pyslim." << EidosTerminate(); } else if (file_version == SLIM_TREES_FILE_VERSION) { - *p_file_version = 9; + // FIXME MULTICHROM: note that the reading code has not actually shifted to this version yet! + *p_file_version = 10; } else EIDOS_TERMINATION << "ERROR (Species::ReadTreeSequenceMetadata): this .trees file was generated by an unrecognized version of SLiM or pyslim (internal file version " << file_version << "); this file cannot be read." << EidosTerminate(); @@ -9785,55 +10010,17 @@ void Species::_MakeHaplosomeMetadataRecords(void) // printf("hap_metadata_2M_ == %.2X\n", hap_metadata_2M_->is_vacant_[0]); } -void Species::MetadataForMutation(Mutation *p_mutation, MutationMetadataRec *p_metadata) -{ - static_assert(sizeof(MutationMetadataRec) == 17, "MutationMetadataRec has changed size; this code probably needs to be updated"); - - if (!p_mutation || !p_metadata) - EIDOS_TERMINATION << "ERROR (Species::MetadataForMutation): (internal error) bad parameters to MetadataForMutation()." << EidosTerminate(); - - p_metadata->mutation_type_id_ = p_mutation->mutation_type_ptr_->mutation_type_id_; - - // FIXME MULTITRAIT: We need to figure out where we're going to put multitrait information in .trees - // For now we just write out the effect for trait 0, but we need the dominance coeff too, and we need - // it for all traits in the model not just trait 0; this design is not going to work. See - // https://github.com/MesserLab/SLiM/issues/569 - MutationBlock *mutation_block = p_mutation->mutation_type_ptr_->mutation_block_; - MutationTraitInfo *mut_trait_info = mutation_block->TraitInfoForMutation(p_mutation); - - p_metadata->selection_coeff_ = mut_trait_info[0].effect_size_; - - p_metadata->subpop_index_ = p_mutation->subpop_index_; - p_metadata->origin_tick_ = p_mutation->origin_tick_; - p_metadata->nucleotide_ = p_mutation->nucleotide_; -} - -void Species::MetadataForSubstitution(Substitution *p_substitution, MutationMetadataRec *p_metadata) -{ - static_assert(sizeof(MutationMetadataRec) == 17, "MutationMetadataRec has changed size; this code probably needs to be updated"); - - if (!p_substitution || !p_metadata) - EIDOS_TERMINATION << "ERROR (Species::MetadataForSubstitution): (internal error) bad parameters to MetadataForSubstitution()." << EidosTerminate(); - - p_metadata->mutation_type_id_ = p_substitution->mutation_type_ptr_->mutation_type_id_; - - // FIXME MULTITRAIT: We need to figure out where we're going to put multitrait information in .trees - // For now we just write out the effect for trait 0, but we need the dominance coeff too, and we need - // it for all traits in the model not just trait 0; this design is not going to work. See - // https://github.com/MesserLab/SLiM/issues/569 - p_metadata->selection_coeff_ = p_substitution->trait_info_[0].effect_size_; - - p_metadata->subpop_index_ = p_substitution->subpop_index_; - p_metadata->origin_tick_ = p_substitution->origin_tick_; - p_metadata->nucleotide_ = p_substitution->nucleotide_; -} - void Species::MetadataForIndividual(Individual *p_individual, IndividualMetadataRec *p_metadata) { - static_assert(sizeof(IndividualMetadataRec) == 40, "IndividualMetadataRec has changed size; this code probably needs to be updated"); + // We check the struct size here to detect changes that would need to be responded to here; but it is + // very important to note that the caller guarantees that the actual size of p_metadata is large enough + // to accommodate all of the per-trait metadata, which is variable-length! + static_assert(sizeof(IndividualMetadataRec) == 56, "IndividualMetadataRec has changed size; this code probably needs to be updated"); +#if DEBUG if (!p_individual || !p_metadata) EIDOS_TERMINATION << "ERROR (Species::MetadataForIndividual): (internal error) bad parameters to MetadataForIndividual()." << EidosTerminate(); +#endif p_metadata->pedigree_id_ = p_individual->PedigreeID(); p_metadata->pedigree_p1_ = p_individual->Parent1PedigreeID(); @@ -9845,6 +10032,20 @@ void Species::MetadataForIndividual(Individual *p_individual, IndividualMetadata p_metadata->flags_ = 0; if (p_individual->migrant_) p_metadata->flags_ |= SLIM_INDIVIDUAL_METADATA_MIGRATED; + + // write per-trait metadata + int trait_count = TraitCount(); + _IndividualPerTraitMetadata *per_trait_metadata_array = p_metadata->per_trait_; + IndividualTraitInfo *per_trait_info_array = p_individual->trait_info_; + + for (int trait_index = 0; trait_index < trait_count; ++trait_index) + { + _IndividualPerTraitMetadata &per_trait_metadata = per_trait_metadata_array[trait_index]; + IndividualTraitInfo &per_trait_info = per_trait_info_array[trait_index]; + + per_trait_metadata.phenotype_ = per_trait_info.phenotype_; + per_trait_metadata.offset_ = per_trait_info.offset_; + } } void Species::CheckTreeSeqIntegrity(void) @@ -10216,6 +10417,8 @@ void Species::CrosscheckTreeSeqIntegrity(void) for (tsk_size_t individual_index = 0; individual_index < shared_individuals_table.num_rows; individual_index++) { tsk_id_t tsk_individual = (tsk_id_t)individual_index; + + // note that IndividualMetadataRec is variable-length now, but we only use the fixed-size header portion here IndividualMetadataRec *metadata_rec = (IndividualMetadataRec *)(shared_individuals_table.metadata + shared_individuals_table.metadata_offset[tsk_individual]); slim_pedigreeid_t pedigree_id = metadata_rec->pedigree_id_; auto lookup = tabled_individuals_hash_.find(pedigree_id); @@ -10584,7 +10787,9 @@ void Species::__RemapSubpopulationIDs(SUBPOP_REMAP_HASH &p_subpop_map, TreeSeqIn char *metadata_bytes = ind_table.metadata + ind_table.metadata_offset[ind_index]; tsk_size_t metadata_length = ind_table.metadata_offset[ind_index + 1] - ind_table.metadata_offset[ind_index]; - if (metadata_length != sizeof(IndividualMetadataRec)) + // BCH 2/11/2026: the IndividualMetadataRec struct is now variable-length, but we only need to + // work with the first part of it; so now we just require the size to be >= the base size + if (metadata_length >= sizeof(IndividualMetadataRec)) EIDOS_TERMINATION << "ERROR (Species::__RemapSubpopulationIDs): unexpected individual metadata length; this file cannot be read." << EidosTerminate(); IndividualMetadataRec *metadata = (IndividualMetadataRec *)metadata_bytes; @@ -10723,6 +10928,7 @@ void Species::__TabulateSubpopulationsFromTreeSequence(std::unordered_map &p_derived_mutations); void RetractNewIndividual(void); + void MetadataForIndividual(Individual *p_individual, IndividualMetadataRec *p_metadata); void AddIndividualsToTable(Individual * const *p_individual, size_t p_num_individuals, tsk_table_collection_t *p_tables, INDIVIDUALS_HASH *p_individuals_hash, tsk_flags_t p_flags); void AddLiveIndividualsToIndividualsTable(tsk_table_collection_t *p_tables, INDIVIDUALS_HASH *p_individuals_hash); void FixAliveIndividuals(tsk_table_collection_t *p_tables); diff --git a/core/trait.h b/core/trait.h index 3a3db1861..aa5e2adfe 100644 --- a/core/trait.h +++ b/core/trait.h @@ -154,7 +154,7 @@ class Trait : public EidosDictionaryRetained void _RecacheIndividualOffsetDistribution(void); // caches individualOffsetDistributionFixed_ and individualOffsetDistributionFixedValue_ slim_trait_offset_t _DrawIndividualOffset(void) const; // draws from the distribution defined by individualOffsetDistributionMean_ and individualOffsetDistributionSD_ inline __attribute__((always_inline)) slim_trait_offset_t DrawIndividualOffset(void) const { return (individualOffsetDistributionFixed_) ? individualOffsetDistributionFixedValue_ : _DrawIndividualOffset(); } - //inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionMean(void) const { return individualOffsetDistributionMean_; } // a bit dangerous because of the exp() post-transform; probably nobody should use this + inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionMean(void) const { return individualOffsetDistributionMean_; } // a bit dangerous because of the exp() post-transform; probably nobody should use this inline __attribute__((always_inline)) slim_trait_offset_t IndividualOffsetDistributionSD(void) const { return individualOffsetDistributionSD_; } inline __attribute__((always_inline)) void IndividualOffsetChanged(void) { individualOffsetEverOverridden_ = true; } inline __attribute__((always_inline)) bool IndividualOffsetEverChanged(void) { return individualOffsetEverOverridden_; } diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index 324896d55..d60be558f 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -894,6 +894,19 @@ bool Eidos_RegexWorks(void); // Checks that symbol_name does not contain any illegal Unicode characters; used to check identifiers, in particular bool Eidos_ContainsIllegalUnicode(const std::string &symbol_name); +// little-endian write of a uint64_t to an address; taken from tskit/test_core.c in PR https://github.com/tskit-dev/tskit/pull/3306 +inline void Eidos_set_u64_le(uint8_t *dest, uint64_t value) +{ + dest[0] = (uint8_t)(value & 0xFF); + dest[1] = (uint8_t)((value >> 8) & 0xFF); + dest[2] = (uint8_t)((value >> 16) & 0xFF); + dest[3] = (uint8_t)((value >> 24) & 0xFF); + dest[4] = (uint8_t)((value >> 32) & 0xFF); + dest[5] = (uint8_t)((value >> 40) & 0xFF); + dest[6] = (uint8_t)((value >> 48) & 0xFF); + dest[7] = (uint8_t)((value >> 56) & 0xFF); +} + // ******************************************************************************************************************* // diff --git a/treerec/tskit/core.c b/treerec/tskit/core.c index 0f31550a7..5fb6d71a0 100644 --- a/treerec/tskit/core.c +++ b/treerec/tskit/core.c @@ -33,6 +33,9 @@ #include #define UUID_NUM_BYTES 16 +#define TSK_JSON_BINARY_HEADER_SIZE 21 + +static const uint8_t TSK_JSON_BINARY_MAGIC[4] = { 'J', 'B', 'L', 'B' }; #if defined(_WIN32) @@ -95,6 +98,22 @@ get_random_bytes(uint8_t *buf) #endif +static uint64_t +tsk_load_u64_le(const uint8_t *p) +{ + uint64_t value; + + value = (uint64_t) p[0]; + value |= (uint64_t) p[1] << 8; + value |= (uint64_t) p[2] << 16; + value |= (uint64_t) p[3] << 24; + value |= (uint64_t) p[4] << 32; + value |= (uint64_t) p[5] << 40; + value |= (uint64_t) p[6] << 48; + value |= (uint64_t) p[7] << 56; + return value; +} + /* Generate a new UUID4 using a system-generated source of randomness. * Note that this function writes a NULL terminator to the end of this * string, so that the total length of the buffer must be 37 bytes. @@ -121,6 +140,67 @@ tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) out: return ret; } + +int +tsk_json_struct_metadata_get_blob(const char *metadata, tsk_size_t metadata_length, + const char **json, tsk_size_t *json_length, const uint8_t **blob, + tsk_size_t *blob_length) +{ + int ret; + uint8_t version; + uint64_t json_length_u64; + uint64_t binary_length_u64; + uint64_t header_and_json_length; + uint64_t total_length; + const uint8_t *bytes; + const uint8_t *blob_start; + const char *json_start; + + if (metadata == NULL || json == NULL || json_length == NULL || blob == NULL + || blob_length == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + bytes = (const uint8_t *) metadata; + if (metadata_length < TSK_JSON_BINARY_HEADER_SIZE) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + if (memcmp(bytes, TSK_JSON_BINARY_MAGIC, sizeof(TSK_JSON_BINARY_MAGIC)) != 0) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + version = bytes[4]; + if (version != 1) { + ret = tsk_trace_error(TSK_ERR_FILE_VERSION_TOO_NEW); + goto out; + } + json_length_u64 = tsk_load_u64_le(bytes + 5); + binary_length_u64 = tsk_load_u64_le(bytes + 13); + if (json_length_u64 > UINT64_MAX - (uint64_t) TSK_JSON_BINARY_HEADER_SIZE) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + header_and_json_length = (uint64_t) TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; + if (binary_length_u64 > UINT64_MAX - header_and_json_length) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + total_length = header_and_json_length + binary_length_u64; + if ((uint64_t) metadata_length < total_length) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + json_start = (const char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; + blob_start = bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; + *json = json_start; + *json_length = (tsk_size_t) json_length_u64; + *blob = blob_start; + *blob_length = (tsk_size_t) binary_length_u64; + ret = 0; +out: + return ret; +} static const char * tsk_strerror_internal(int err) { diff --git a/treerec/tskit/core.h b/treerec/tskit/core.h index 481905b7a..9d5ce2087 100644 --- a/treerec/tskit/core.h +++ b/treerec/tskit/core.h @@ -1096,6 +1096,31 @@ bool tsk_isfinite(double val); #define TSK_UUID_SIZE 36 int tsk_generate_uuid(char *dest, int flags); +/** +@brief Extract the binary payload from ``json+struct`` encoded metadata. + +@rst +Metadata produced by :py:class:`tskit.metadata.JSONStructCodec` consists of a fixed-size +header followed by canonical JSON bytes and an optional binary payload. This helper +validates the framing, returning pointers to the embedded JSON and binary sections +without copying. + +The output pointers reference memory owned by the caller and remain valid only while +the original metadata buffer is alive. +@endrst + +@param[in] metadata Pointer to the encoded metadata bytes. +@param[in] metadata_length Number of bytes available at ``metadata``. +@param[out] json On success, set to the start of the JSON bytes. +@param[out] json_length On success, set to the JSON length in bytes. +@param[out] blob On success, set to the start of the binary payload. +@param[out] blob_length On success, set to the payload length in bytes. +@return 0 on success, or a :ref:`TSK_ERR ` code on failure. +*/ +int tsk_json_struct_metadata_get_blob(const char *metadata, tsk_size_t metadata_length, + const char **json, tsk_size_t *json_length, const uint8_t **blob, + tsk_size_t *blob_length); + /* TODO most of these can probably be macros so they compile out as no-ops. * Lets do the 64 bit tsk_size_t switch first though. */ void *tsk_malloc(tsk_size_t size);