diff --git a/cite_seq_count/__main__.py b/cite_seq_count/__main__.py index bf1c3ba..274ff9e 100755 --- a/cite_seq_count/__main__.py +++ b/cite_seq_count/__main__.py @@ -558,7 +558,7 @@ def main(): # If given, use whitelist for top cells if whitelist: - top_cells = whitelist + top_cells_set = whitelist # Add potential missing cell barcodes. for missing_cell in whitelist: if missing_cell in final_results: @@ -567,30 +567,34 @@ def main(): final_results[missing_cell] = dict() for TAG in ordered_tags_map: final_results[missing_cell][TAG] = Counter() - top_cells.add(missing_cell) + top_cells_set.add(missing_cell) else: # Select top cells based on total umis per cell top_cells_tuple = umis_per_cell.most_common(args.expected_cells) - top_cells = set([pair[0] for pair in top_cells_tuple]) + top_cells_set = set([pair[0] for pair in top_cells_tuple]) # UMI correction if args.no_umi_correction: # Don't correct umis_corrected = 0 - aberrant_cells = set() + aberrant_cells_set = set() else: # Correct UMIS - (final_results, umis_corrected, aberrant_cells) = processing.correct_umis( + (final_results, umis_corrected, aberrant_cells_set) = processing.correct_umis( final_results=final_results, collapsing_threshold=args.umi_threshold, - top_cells=top_cells, + top_cells=top_cells_set, max_umis=20000, ) # Remove aberrant cells from the top cells - for cell_barcode in aberrant_cells: - top_cells.remove(cell_barcode) + for cell_barcode in aberrant_cells_set: + top_cells_set.remove(cell_barcode) + + # Ensure cell order (required for pandas>=2.0.0) + top_cells = list(top_cells_set) + aberrant_cells = list(aberrant_cells_set) # Create sparse aberrant cells matrix (umi_aberrant_matrix, read_aberrant_matrix) = processing.generate_sparse_matrices( diff --git a/cite_seq_count/io.py b/cite_seq_count/io.py index 2dc04f0..c13bd88 100644 --- a/cite_seq_count/io.py +++ b/cite_seq_count/io.py @@ -12,7 +12,7 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol Args: 
sparse_matrix (dok_matrix): Results in a sparse matrix. - top_cells (set): Set of cells that are selected for output. + top_cells (list): List of cells that are selected for output. ordered_tags_map (dict): Tags in order with indexes as values. data_type (string): A string definning if the data is umi or read based. outfolder (string): Path to the output folder. @@ -35,11 +35,11 @@ def write_to_files(sparse_matrix, top_cells, ordered_tags_map, data_type, outfol def write_dense(sparse_matrix, index, columns, outfolder, filename): """ Writes a dense matrix in a csv format - + Args: sparse_matrix (dok_matrix): Results in a sparse matrix. index (list): List of TAGS - columns (set): List of cells + columns (list): List of cells outfolder (str): Output folder filename (str): Filename """ @@ -59,7 +59,7 @@ def write_unmapped(merged_no_match, top_unknowns, outfolder, filename): outfolder (string): Path of the output folder filename (string): Name of the output file """ - + top_unmapped = merged_no_match.most_common(top_unknowns) with open(os.path.join(outfolder, filename),'w') as unknown_file: diff --git a/cite_seq_count/processing.py b/cite_seq_count/processing.py index 57b35b7..d498b40 100644 --- a/cite_seq_count/processing.py +++ b/cite_seq_count/processing.py @@ -105,7 +105,7 @@ def map_reads( Args: read1_path (string): Path to R1.fastq.gz read2_path (string): Path to R2.fastq.gz - chunk_size (int): The number of lines to process + chunk_size (int): The number of lines to process tags (dict): A dictionary with the TAGs + TAG Names. barcode_slice (slice): A slice for extracting the Barcode portion from the sequence. @@ -234,13 +234,13 @@ def merge_results(parallel_results): def correct_umis(final_results, collapsing_threshold, top_cells, max_umis): """ Corrects umi barcodes within same cell/tag groups. - + Args: final_results (dict): Dict of dict of Counters with mapping results. collapsing_threshold (int): Max distance between umis.
top_cells (set): Set of cells to go through. max_umis (int): Maximum UMIs to consider for one cluster. - + Returns: final_results (dict): Same as input but with corrected umis. corrected_umis (int): How many umis have been corrected. @@ -339,14 +339,14 @@ def correct_cells( ): """ Corrects cell barcodes. - + Args: final_results (dict): Dict of dict of Counters with mapping results. umis_per_cell (Counter): Counter of number of umis per cell. collapsing_threshold (int): Max distance between umis. expected_cells (int): Number of expected cells. ab_map (dict): Dict of the TAGS. - + Returns: final_results (dict): Same as input but with corrected umis. umis_per_cell (Counter): Counter of umis per cell after cell barcode correction @@ -375,7 +375,7 @@ def correct_cells_whitelist( ): """ Corrects cell barcodes. - + Args: final_results (dict): Dict of dict of Counters with mapping results. umis_per_cell (Counter): Counter of UMIs per cell. @@ -383,7 +383,7 @@ def correct_cells_whitelist( collapsing_threshold (int): Max distance between umis. ab_map (OrederedDict): Tags in an ordered dict. - + Returns: final_results (dict): Same as input but with corrected umis. umis_per_cell (Counter): Updated UMI counts after correction. @@ -479,4 +479,3 @@ def generate_sparse_matrices(final_results, ordered_tags_map, top_cells): final_results[cell_barcode][TAG].values() ) return (umi_results_matrix, read_results_matrix) - diff --git a/setup.py b/setup.py index 6423bc3..bfb38e8 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ "scipy>=1.1.0", "multiprocess>=0.70.6.1", "umi_tools==1.0.0", - "pytest==4.1.0", + "pytest>=8.1.0", "pytest-dependency==0.4.0", "pandas>=0.23.4", "pybktree==1.1",