def run_data_metadata(parser_args):
    """Run the headless metadata-extraction kernel and log a closing banner.

    A logger is created for the ``data_metadata`` module, handed to
    ``cli.DataMetadataKernel`` together with the log path and the parsed
    command-line arguments, and the kernel is executed.

    Parameters
    ----------
    parser_args
        Parsed command-line arguments, forwarded untouched to the kernel.
    """
    from cellacdc import myutils, cli

    # setupLogger returns four values; only the logger and log path are used.
    logger, _logs_path, log_path, _log_filename = myutils.setupLogger(
        module='data_metadata', logs_path=None
    )
    cli.DataMetadataKernel(logger, log_path, parser_args).run()

    banner = '**********************************************'
    logger.info(banner)
    logger.info(f'Cell-ACDC metadata CLI closed. {myutils.get_salute_string()}')
    logger.info(banner)
def run_data_restructure(parser_args):
    """Run the headless data-restructure kernel and log a closing banner.

    A logger is created for the ``data_restructure`` module, handed to
    ``cli.DataRestructureKernel`` together with the log path and the parsed
    command-line arguments, and the kernel is executed.

    Parameters
    ----------
    parser_args
        Parsed command-line arguments, forwarded untouched to the kernel.
    """
    from cellacdc import myutils, cli

    # setupLogger returns four values; only the logger and log path are used.
    logger, _logs_path, log_path, _log_filename = myutils.setupLogger(
        module='data_restructure', logs_path=None
    )
    restructure_kernel = cli.DataRestructureKernel(
        logger, log_path, parser_args
    )
    restructure_kernel.run()

    banner = '**********************************************'
    logger.info(banner)
    closing_message = (
        f'Cell-ACDC data restructure closed. {myutils.get_salute_string()}'
    )
    logger.info(closing_message)
    logger.info(banner)
def read_filename_pattern(file_name):
    """Split a raw file name into position name, frame number and channel.

    The trailing frame token plus extension (e.g. ``_t01.tif``) is removed
    using the pattern reported by ``get_frame_num_and_pattern``. What is left
    is split at its LAST underscore into ``<pos_name>_<channel_name>``.

    Parameters
    ----------
    file_name : str
        File name (not a full path) to analyse.

    Returns
    -------
    tuple
        ``(pos_name, frame_number, channel_name)``. ``frame_number`` is the
        matched digit string, or ``None`` when no frame pattern matched.
        If the stem contains no underscore, ``pos_name`` is ``''`` and the
        whole stem is returned as ``channel_name``.
    """
    matching_frame_name_pattern, frame_number = get_frame_num_and_pattern(
        file_name
    )
    stem = re.sub(matching_frame_name_pattern, '', file_name)

    # rpartition splits on the last underscore and is well defined for the
    # edge cases a manual reverse scan gets wrong: an empty stem, a stem
    # without any underscore, and a stem ending in an underscore.
    pos_name, _, channel_name = stem.rpartition('_')

    # When no frame pattern matched, the extension is still attached.
    if channel_name.endswith('.tif'):
        channel_name = channel_name[:-4]
    return pos_name, frame_number, channel_name
bool = False + lens_na: Optional[float] = None + size_t: Optional[int] = None + size_z: Optional[int] = None + size_c: Optional[int] = None + size_s: Optional[int] = None + time_increment: Optional[float] = None + physical_size_x: Optional[float] = None + physical_size_y: Optional[float] = None + physical_size_z: Optional[float] = None + channels: Optional[list] = None + em_wavelens: Optional[list] = None + positions: Optional[list] = None + time_range_start: int = 0 + time_range_end: Optional[int] = None + save_channels: Optional[list] = None + add_image_name: bool = False + basename: Optional[str] = None + + @classmethod + def from_parser_args(cls, args: dict) -> 'HeadlessMetadataHandler': + channels = None + if args.get('channels'): + channels = [c.strip() for c in args['channels'].split(',') if c.strip()] + + em_wavelens = None + if args.get('em_wavelens'): + em_wavelens = [ + float(w.strip()) for w in args['em_wavelens'].split(',') if w.strip() + ] + + positions = None + if args.get('positions'): + positions = [p.strip() for p in args['positions'].split(',') if p.strip()] + + save_channels = None + if args.get('save_channels'): + save_channels = [ + s.strip().lower() in ('1', 'true', 'yes') + for s in args['save_channels'].split(',') + ] + + time_range_start, time_range_end = parse_time_range(args.get('time_range')) + + handler = cls( + trust_metadata=args.get('trust_metadata', True), + to_h5=args.get('format', 'tif') == 'h5', + lens_na=args.get('lens_na'), + size_t=args.get('size_t'), + size_z=args.get('size_z'), + size_c=args.get('size_c'), + size_s=args.get('size_s'), + time_increment=args.get('time_increment'), + physical_size_x=args.get('physical_size_x'), + physical_size_y=args.get('physical_size_y'), + physical_size_z=args.get('physical_size_z'), + channels=channels, + em_wavelens=em_wavelens, + positions=positions, + time_range_start=time_range_start, + time_range_end=time_range_end, + save_channels=save_channels, + 
def parse_time_range(time_range: Optional[str]) -> tuple:
    """Parse a time-range option into ``(start, end)`` frame indices.

    Accepted forms:

    * ``None`` or empty -> ``(0, None)`` (no explicit range)
    * ``"END"``         -> ``(0, END)``
    * ``"START:END"``   -> ``(START, END)``
    * ``"START:"``      -> ``(START, None)`` (open end)
    * ``":END"``        -> ``(0, END)`` (open start)

    Surrounding whitespace around each number is ignored.

    Parameters
    ----------
    time_range : str or None
        Raw option value.

    Returns
    -------
    tuple
        ``(start, end)`` where ``start`` is an ``int`` and ``end`` is an
        ``int`` or ``None`` when the end was not specified.

    Raises
    ------
    ValueError
        If a bound is not an integer, or if ``end`` is smaller than
        ``start``.
    """
    if not time_range:
        return 0, None

    text = str(time_range).strip()
    if not text:
        return 0, None

    start_text, separator, end_text = text.partition(':')
    if not separator:
        # A single number is the end frame; the start defaults to 0.
        return 0, int(text)

    start = int(start_text) if start_text.strip() else 0
    end = int(end_text) if end_text.strip() else None
    if end is not None and end < start:
        raise ValueError(
            f'Invalid time range "{time_range}": end ({end}) is smaller '
            f'than start ({start}).'
        )
    return start, end
def on_existing_to_worker_flags(on_existing: str) -> dict:
    """Translate an on-existing policy name into three boolean flags.

    Exactly one of the returned flags is ``True``:

    * ``'overwrite'``  -> ``overwrite``
    * ``'add'``        -> ``add_files``
    * ``'create-new'`` -> ``create_new``

    Parameters
    ----------
    on_existing : str
        Policy name.

    Returns
    -------
    dict
        Mapping with keys ``'overwrite'``, ``'add_files'`` and
        ``'create_new'``.

    Raises
    ------
    ValueError
        If ``on_existing`` is not one of the three supported policies.
    """
    # (policy name, flag that the policy switches on)
    policy_table = (
        ('overwrite', 'overwrite'),
        ('add', 'add_files'),
        ('create-new', 'create_new'),
    )
    for policy, enabled_flag in policy_table:
        if on_existing == policy:
            return {
                flag: flag == enabled_flag for _, flag in policy_table
            }
    raise ValueError(
        f'Invalid on-existing policy "{on_existing}". '
        'Valid values: overwrite, add, create-new'
    )
def read_metadata_bioio(raw_filepath: str):
    """Read the metadata of a raw microscopy file through a helper process.

    The ``_read_metadata.py`` script that sits next to the
    ``acdc_bioio_bioformats`` package is launched via ``_process`` in a
    separate Python process. Any exception recorded for this run's UUID is
    re-raised by ``check_raise_exception``. Afterwards ``metadataXML.txt``
    and ``metadata.txt`` are loaded from ``bioio_sample_data_folderpath``
    (presumably written there by the helper script; verify against
    ``_read_metadata.py``).

    Parameters
    ----------
    raw_filepath : str
        Path of the raw microscopy file.

    Returns
    -------
    tuple
        ``(metadata, metadataXML)``: the ``OMEXML`` object and the
        ``Metadata`` object initialised from the two text files.
    """
    from . import bioio_sample_data_folderpath, _process
    from . import acdc_bioio_bioformats as bioformats
    import subprocess

    script_path = os.path.join(
        os.path.dirname(bioformats.__file__), '_read_metadata.py'
    )
    run_id = uuid.uuid4()
    # The helper expects one string with comma-separated tokens.
    command_tokens = (
        sys.executable, script_path, '-f', raw_filepath, '-uuid', run_id
    )
    command = ', '.join(f'{token}' for token in command_tokens)
    # check=False: failures are reported through check_raise_exception.
    subprocess.run(
        [sys.executable, _process.__file__, '-c', command], check=False
    )
    bioformats._utils.check_raise_exception(run_id)

    xml_txt_path = os.path.join(
        bioio_sample_data_folderpath, 'metadataXML.txt'
    )
    metadataXML = bioformats.Metadata().init_from_file(xml_txt_path)

    ome_txt_path = os.path.join(bioio_sample_data_folderpath, 'metadata.txt')
    metadata = bioformats.OMEXML().init_from_file(ome_txt_path, raw_filepath)
    return metadata, metadataXML
import load + + if raw_filepath.endswith('.ome.tif'): + metadata = load.OMEXML(raw_filepath) + metadata_xml = metadata.omexml_string + else: + metadata, metadata_xml_obj = read_metadata_bioio(raw_filepath) + metadata_xml = str(metadata_xml_obj) + + parsed = ParsedRawMetadata(metadata_xml=metadata_xml) + + try: + parsed.lens_na = float(metadata.instrument().Objective.LensNA) + except Exception: + pass + + try: + parsed.size_s = int(metadata.get_image_count()) + except Exception: + pass + + try: + parsed.size_z = int(metadata.image().Pixels.SizeZ) + except Exception: + pass + + try: + parsed.size_t = int(metadata.image().Pixels.SizeT) + except Exception: + pass + + try: + parsed.time_increment = float(metadata.image().Pixels.node.get('TimeIncrement')) + except Exception: + pass + + try: + unit = metadata.image().Pixels.node.get('TimeIncrementUnit') + if unit is not None: + parsed.time_increment_unit = unit + except Exception: + pass + + try: + parsed.size_c = int(metadata.image().Pixels.SizeC) + except Exception: + pass + + try: + parsed.physical_size_x = float(metadata.image().Pixels.PhysicalSizeX) + except Exception: + pass + + try: + parsed.physical_size_y = float(metadata.image().Pixels.PhysicalSizeY) + except Exception: + pass + + try: + parsed.physical_size_z = float(metadata.image().Pixels.PhysicalSizeZ) + except Exception: + pass + + try: + unit = metadata.image().Pixels.node.get('PhysicalSizeXUnit') + if unit is not None: + parsed.physical_size_unit = unit + except Exception: + pass + + try: + image_name = metadata.image().Name + if image_name is not None: + parsed.image_name = image_name + except Exception: + pass + + ch_names = [] + em_wavelens = [] + for c in range(parsed.size_c): + try: + ch_names.append(metadata.image().Pixels.Channel(c).Name or f'channel_{c}') + except Exception: + ch_names.append(f'channel_{c}') + try: + em_wavelen = metadata.image().Pixels.Channel(c).node.get('EmissionWavelength') + em_wavelens.append(float(em_wavelen)) + except 
def metadata_to_dataframe(
        parsed: ParsedRawMetadata,
        basename: str,
    ) -> pd.DataFrame:
    """Build the ``Description``/``values`` metadata table for one raw file.

    The table has one row per scalar entry (``LensNA``, ``SizeT``, ``SizeZ``,
    ``TimeIncrement``, ``PhysicalSizeZ``, ``PhysicalSizeY``,
    ``PhysicalSizeX``, ``basename``), followed by one ``channel_{c}_name``
    row per channel name and one ``channel_{c}_emWavelen`` row per emission
    wavelength.

    Parameters
    ----------
    parsed : ParsedRawMetadata
        Metadata values read from the raw file.
    basename : str
        Basename stored in the ``basename`` row.

    Returns
    -------
    pandas.DataFrame
        Single-column (``values``) frame indexed by ``Description``.
    """
    df = pd.DataFrame({
        'LensNA': parsed.lens_na,
        'SizeT': parsed.size_t,
        'SizeZ': parsed.size_z,
        'TimeIncrement': parsed.time_increment,
        'PhysicalSizeZ': parsed.physical_size_z,
        'PhysicalSizeY': parsed.physical_size_y,
        'PhysicalSizeX': parsed.physical_size_x,
        'basename': basename,
    }, index=['values']).T
    df.index.name = 'Description'

    channel_names = list(parsed.channel_names)
    em_wavelens = list(parsed.em_wavelens)

    # Each label list is sized by its own value list, so the two columns of
    # the channel table always have equal length even when the number of
    # wavelengths differs from the number of channel names.
    description = [f'channel_{c}_name' for c in range(len(channel_names))]
    description.extend(
        f'channel_{c}_emWavelen' for c in range(len(em_wavelens))
    )
    if not description:
        # Nothing to append; avoid concatenating an empty frame.
        return df

    df_channels = pd.DataFrame({
        'Description': description,
        'values': channel_names + em_wavelens,
    }).set_index('Description')
    return pd.concat([df, df_channels])
def get_start_pos_n(exp_dst_path: str, on_existing: str) -> int:
    """Return the number to use for the first Position folder to be written.

    Numbering starts at 1 unless the policy is ``'create-new'`` and the
    destination already holds Position folders; in that case numbering
    continues after the highest existing number.

    Parameters
    ----------
    exp_dst_path : str
        Destination experiment folder.
    on_existing : str
        On-existing policy name.

    Returns
    -------
    int
        First position number to use.
    """
    if on_existing == 'create-new':
        existing_folders = myutils.get_pos_foldernames(exp_dst_path)
        if existing_folders:
            # Folder names end with "_<number>"; continue after the largest.
            highest = max(
                int(name.split('_')[-1]) for name in existing_folders
            )
            return highest + 1
    return 1
@dataclass
class ConvertState:
    """Resolved settings used while converting one raw microscopy file.

    Instances are created by ``build_convert_state`` and read by
    ``_write_position_metadata``, ``_save_channels_bioio``,
    ``_save_single_channel_bioio`` and ``_should_save_position``.
    """
    # Written to the metadata CSV as 'LensNA'.
    lens_na: float
    # Passed to the save scripts as '-t'.
    size_t: int
    # Passed to the save scripts as '-z'; written to the CSV as 'SizeZ'.
    size_z: int
    # Number of channels; build_convert_state sets it to len(channel_names).
    size_c: int
    # Number of series in the file; run_convert_cli iterates range(size_s)
    # for the single-multi-pos layout.
    size_s: int
    # Written to the CSV as 'TimeIncrement' and passed as '-time_increment'.
    time_increment: float
    # Written to the CSV as 'PhysicalSizeX/Y/Z'; passed as '-zyx' in Z, Y, X
    # order.
    physical_size_x: float
    physical_size_y: float
    physical_size_z: float
    # One entry per channel, indexed in parallel with save_channels.
    channel_names: list
    em_wavelens: list
    # Written verbatim to the per-position '...metadataXML.txt' file.
    metadata_xml: str
    # When True the '-to_h5' flag is appended to the save command.
    to_h5: bool = False
    # Names of the in-file positions to save ('Position_<n>', 1-based), or
    # 'All Positions' to save everything; see _should_save_position.
    selected_pos: list = field(default_factory=lambda: ['All Positions'])
    # Frame range passed as '-r start end'. The end is treated as inclusive
    # by _write_position_metadata (saved SizeT = end - start + 1).
    # build_convert_state fills a missing end with size_t - 1.
    time_range_start: int = 0
    time_range_end: Optional[int] = None
    # Per-channel booleans; None (or empty) is treated as "save all".
    save_channels: Optional[list] = None
    # When add_image_name is True and image_name is non-empty,
    # get_acdc_filename inserts the sanitized image name in output filenames.
    add_image_name: bool = False
    image_name: str = ''
physical_size_x=physical_size_x, + physical_size_y=physical_size_y, + physical_size_z=physical_size_z, + channel_names=channel_names, + em_wavelens=em_wavelens, + metadata_xml=parsed.metadata_xml, + to_h5=handler.to_h5, + selected_pos=( + handler.positions if handler.positions is not None + else ['All Positions'] + ), + time_range_start=handler.time_range_start, + time_range_end=( + handler.time_range_end if handler.time_range_end is not None + else size_t - 1 + ), + save_channels=handler.save_channels, + add_image_name=handler.add_image_name, + image_name=parsed.image_name, + ) + + +def _sanitize_image_name(image_name: str) -> str: + ch_name = "".join( + c if c.isalnum() or c == '_' or c == '' else '_' for c in image_name + ) + while ch_name.endswith('_'): + ch_name = ch_name[:-1] + return ch_name + + +def get_acdc_filename( + filename_no_ext: str, + s0p: str, + append_txt: str, + ext: str, + add_image_name: bool = False, + image_name: str = '', + return_basename: bool = False, + ): + filename_no_ext = filename_no_ext.replace('.', '_') + if add_image_name and image_name: + image_name = _sanitize_image_name(image_name) + basename = f'{filename_no_ext}_{image_name}_s{s0p}_' + else: + basename = f'{filename_no_ext}_s{s0p}_' + filename = f'{basename}{append_txt}{ext}' + if return_basename: + return filename, basename + return filename + + +def _write_position_metadata( + state: ConvertState, + images_path: str, + filename_no_ext: str, + s0p: str, + series: int, + ) -> str: + metadata_xml_path = os.path.join( + images_path, + get_acdc_filename( + filename_no_ext, s0p, 'metadataXML', '.txt', + state.add_image_name, state.image_name, + ), + ) + with open(metadata_xml_path, 'w', encoding='utf-8') as txt: + txt.write(state.metadata_xml) + + metadata_filename, basename = get_acdc_filename( + filename_no_ext, s0p, 'metadata', '.csv', + state.add_image_name, state.image_name, + return_basename=True, + ) + metadata_csv_path = os.path.join(images_path, metadata_filename) + 
def _run_bioio_subprocess(command: str, uuid4):
    """Run a BioIO helper command in a separate Python process.

    ``command`` is handed to the ``_process`` script via its ``-c`` option.
    The return code of the child process is not checked; instead, any
    exception recorded for ``uuid4`` is re-raised through
    ``check_raise_exception``.

    Parameters
    ----------
    command : str
        Comma-separated command string understood by ``_process``.
    uuid4
        Identifier of this run, used to look up a recorded exception.
    """
    from . import _process
    from . import acdc_bioio_bioformats as bioformats

    subprocess.run(
        [sys.executable, _process.__file__, '-c', command],
        check=False,
    )
    bioformats._utils.check_raise_exception(uuid4)
import acdc_bioio_bioformats as bioformats + + save_data_py_filepath = os.path.join( + os.path.dirname(bioformats.__file__), '_save_data.py' + ) + save_channels = state.save_channels or [True] * state.size_c + zyx_physical_sizes = " ".join([ + str(state.physical_size_z), + str(state.physical_size_y), + str(state.physical_size_x), + ]) + uuid4 = uuid.uuid4() + command = ( + f'{sys.executable}, {save_data_py_filepath}, ' + f'-f, {raw_filepath}, ' + f'-d, {" ".join([str(val) for val in save_channels])}, ' + f'-c, {" ".join(state.channel_names)}, ' + f'-s, {series}, ' + f'-i, {images_path}, ' + f'-p, {filename_no_ext}, ' + f'-pos, {s0p}, ' + f'-t, {state.size_t}, ' + f'-z, {state.size_z}, ' + f'-time_increment, {state.time_increment}, ' + f'-zyx, {zyx_physical_sizes}, ' + f'-r, {state.time_range_start} {state.time_range_end}, ' + f'-uuid, {uuid4}' + ) + if state.to_h5: + command = f'{command}, -to_h5' + if not lazy_load: + command = f'{command}, -a' + logger_func( + f'Saving channels via BioIO for series {series} to {images_path}...' + ) + _run_bioio_subprocess(command, uuid4) + + +def _save_single_channel_bioio( + state: ConvertState, + raw_filepath: str, + images_path: str, + filename_no_ext: str, + s0p: str, + series: int, + ch_name: str, + ch_idx: int, + lazy_load: bool, + logger_func=print, + ): + from . 
import acdc_bioio_bioformats as bioformats + + save_data_py_filepath = os.path.join( + os.path.dirname(bioformats.__file__), '_save_data_single_channel.py' + ) + save_channels = state.save_channels or [True] * state.size_c + zyx_physical_sizes = " ".join([ + str(state.physical_size_z), + str(state.physical_size_y), + str(state.physical_size_x), + ]) + uuid4 = uuid.uuid4() + command = ( + f'{sys.executable}, {save_data_py_filepath}, ' + f'-f, {raw_filepath}, ' + f'-d, {" ".join([str(val) for val in save_channels])}, ' + f'-c, {ch_name}, ' + f'-ch_idx, {ch_idx}, ' + f'-s, {series}, ' + f'-i, {images_path}, ' + f'-p, {filename_no_ext}, ' + f'-pos, {s0p}, ' + f'-t, {state.size_t}, ' + f'-z, {state.size_z}, ' + f'-time_increment, {state.time_increment}, ' + f'-zyx, {zyx_physical_sizes}, ' + f'-r, {state.time_range_start} {state.time_range_end}, ' + f'-uuid, {uuid4}' + ) + if state.to_h5: + command = f'{command}, -to_h5' + if not lazy_load: + command = f'{command}, -a' + logger_func(f'Saving channel {ch_name} via BioIO...') + _run_bioio_subprocess(command, uuid4) + + +def _should_save_position(state: ConvertState, in_file_pos_idx: int) -> bool: + in_file_pos_name = f'Position_{in_file_pos_idx + 1}' + return ( + 'All Positions' in state.selected_pos + or in_file_pos_name in state.selected_pos + ) + + +def _save_to_pos_folder( + state: ConvertState, + raw_src_path: str, + exp_dst_path: str, + filename: str, + series: int, + pos_n: int, + num_pos_digits: int, + raw_data_struct: int, + overwrite_pos: bool, + create_new: bool, + lazy_load: bool, + logger_func=print, + basename_for_channels: Optional[str] = None, + ): + raw_filepath = os.path.join(raw_src_path, filename) + if not _should_save_position(state, series): + return + + pos_path = os.path.join(exp_dst_path, f'Position_{pos_n}') + images_path = os.path.join(pos_path, 'Images') + + if os.path.exists(images_path) and overwrite_pos: + shutil.rmtree(images_path) + + if os.path.exists(images_path) and create_new: + 
def run_convert_cli(args: dict, logger_func=print):
    """Convert raw microscopy files into the Position-folder structure.

    Parameters
    ----------
    args : dict
        Parsed command-line options. Keys read here: ``'input'`` (folder
        with the raw files), ``'output'`` (destination experiment folder),
        ``'layout'``, and the optional ``'on_existing'`` (default
        ``'overwrite'``), ``'lazy_load'`` (default ``True``) and
        ``'move_raw'`` (default ``True``). The whole mapping is also passed
        to ``HeadlessMetadataHandler.from_parser_args``.
    logger_func : callable, optional
        Function called with each progress message. Defaults to ``print``.

    Raises
    ------
    NotADirectoryError
        If ``args['input']`` is not an existing folder.
    ValueError
        If the layout or the on-existing policy is not recognised.
    FileNotFoundError
        If the input folder contains no files.
    """
    raw_src_path = os.path.abspath(args['input'])
    exp_dst_path = os.path.abspath(args['output'])
    layout = args['layout']
    raw_data_struct = layout_to_raw_data_struct(layout)

    if not os.path.isdir(raw_src_path):
        raise NotADirectoryError(
            f'Input path must be a folder containing raw microscopy files: '
            f'"{raw_src_path}"'
        )

    os.makedirs(exp_dst_path, exist_ok=True)
    raw_filenames = list_raw_microscopy_files(raw_src_path, layout)
    logger_func(
        f'Found {len(raw_filenames)} raw file(s) in "{raw_src_path}"'
    )

    on_existing = args.get('on_existing', 'overwrite')
    worker_flags = on_existing_to_worker_flags(on_existing)
    start_pos_n = get_start_pos_n(exp_dst_path, on_existing)
    metadata_handler = HeadlessMetadataHandler.from_parser_args(args)
    lazy_load = args.get('lazy_load', True)
    move_raw = args.get('move_raw', True)
    if exp_dst_path == raw_src_path and not move_raw:
        logger_func(
            'Input and output are the same folder; enabling --move-raw.'
        )
        move_raw = True

    raw_filepath = os.path.join(raw_src_path, raw_filenames[0])
    init_bioio_reader(raw_filepath, logger_func=logger_func)

    overwrite_pos = worker_flags['overwrite']
    create_new = worker_flags['create_new']

    if raw_data_struct == 2:
        # One file per channel: positions and channels come from the file
        # names, and metadata is read once from the first file.
        parsed_basename, pos_nums, channel_names = (
            parse_one_per_channel_files(raw_filenames)
        )
        channel_basename = metadata_handler.basename
        if channel_basename is None:
            channel_basename = parsed_basename
        metadata_handler.channels = metadata_handler.channels or channel_names

        parsed = parse_raw_metadata(
            os.path.join(raw_src_path, raw_filenames[0])
        )
        state = build_convert_state(parsed, metadata_handler)
        num_pos_digits = len(str(len(pos_nums)))
        for pos in pos_nums:
            _save_to_pos_folder(
                state, raw_src_path, exp_dst_path, channel_basename,
                0, pos, num_pos_digits, raw_data_struct,
                overwrite_pos, create_new, lazy_load,
                logger_func=logger_func,
                basename_for_channels=channel_basename,
            )
        for filename in raw_filenames:
            _move_raw_file(raw_src_path, filename, move_raw)
    else:
        for p, filename in enumerate(raw_filenames):
            parsed = parse_raw_metadata(os.path.join(raw_src_path, filename))
            state = build_convert_state(parsed, metadata_handler)

            if raw_data_struct == 0:
                # Single file holding several positions: each series gets
                # its own Position folder. Using one folder number for all
                # series would make them overwrite each other.
                num_pos_digits = len(str(state.size_s))
                for in_file_p in range(state.size_s):
                    _save_to_pos_folder(
                        state, raw_src_path, exp_dst_path, filename,
                        in_file_p, start_pos_n + in_file_p, num_pos_digits,
                        raw_data_struct, overwrite_pos, create_new,
                        lazy_load, logger_func=logger_func,
                    )
            else:
                # One file per position: the file index selects the folder.
                num_pos_digits = len(str(len(raw_filenames)))
                _save_to_pos_folder(
                    state, raw_src_path, exp_dst_path, filename,
                    0, start_pos_n + p, num_pos_digits, raw_data_struct,
                    overwrite_pos, create_new, lazy_load,
                    logger_func=logger_func,
                )

            _move_raw_file(raw_src_path, filename, move_raw)

    logger_func(f'Conversion completed. Output saved to "{exp_dst_path}".')
valid_filenames = list_restructure_files(src_path) + sample_filename = valid_filenames[0] + frame_name_pattern, _ = get_frame_num_and_pattern(sample_filename) + + files_info = {} + for file in valid_filenames: + try: + for ch in channels: + match = re.findall(rf'(.*)_{re.escape(ch)}{frame_name_pattern}', file) + if match: + break + else: + raise FileNotFoundError( + f'The file name "{file}" does not contain any channel name' + ) + pos_name, _, frame_name = match[0] + frame_number = int(frame_name) + if pos_name not in files_info: + files_info[pos_name] = {ch: [(file, frame_number)]} + elif ch not in files_info[pos_name]: + files_info[pos_name][ch] = [(file, frame_number)] + else: + files_info[pos_name][ch].append((file, frame_number)) + except Exception: + logger_func( + f'WARNING: File "{file}" does not contain a valid pattern. ' + 'Skipping it.' + ) + + all_pos_data_info = [] + for p, (pos_name, channel_info) in enumerate(files_info.items()): + logger_func('=' * 40) + logger_func(f'Processing position "{pos_name}"...') + + img = None + for files_list in channel_info.values(): + file_path = os.path.join(src_path, files_list[0][0]) + try: + img = load.imread(file_path) + break + except Exception: + continue + if img is None: + logger_func( + f'WARNING: No valid image files found for position "{pos_name}"' + ) + continue + + if basename: + pos_basename = f'{basename}_{pos_name}_' + else: + pos_basename = f'{pos_name}_' + + first_files_list = next(iter(channel_info.values())) + size_t = len(first_files_list) + df_metadata = pd.DataFrame({ + 'SizeT': size_t, + 'basename': pos_basename, + }, index=['values']) + + for c, (channel_name, files_list) in enumerate(channel_info.items()): + logger_func(f' Processing channel "{channel_name}"...') + sorted_files_list = sorted(files_list, key=lambda t: t[1]) + df_metadata[f'channel_{c}_name'] = [channel_name] + + images_path = os.path.join(dst_path, f'Position_{p + 1}', 'Images') + os.makedirs(images_path, exist_ok=True) + + 
video_data = None + src_segm_paths = [''] * size_t + frame_numbers = [] + size_z = 1 + for frame_i, file_info in enumerate(sorted_files_list): + file, _ = file_info + src_img_file_path = os.path.join(src_path, file) + try: + img = load.imread(src_img_file_path) + if video_data is None: + video_data = np.zeros((size_t, *img.shape), dtype=img.dtype) + video_data[frame_i] = img + frame_number_match = re.findall(frame_name_pattern, file)[0][1] + frame_numbers.append(int(frame_number_match)) + except Exception: + continue + + if segm_folder and c == 0: + src_segm_paths[frame_i] = os.path.join(segm_folder, file) + + if img.ndim == 3: + size_z = len(img) + df_metadata['SizeZ'] = [size_z] + + if video_data is None: + logger_func( + f'WARNING: No valid image files found for position ' + f'"{pos_name}", channel "{channel_name}"' + ) + continue + + img_file_name = f'{pos_basename}{channel_name}.tif' + dst_img_file_path = os.path.join(images_path, img_file_name) + dst_segm_file_name = f'{pos_basename}segm_{channel_name}.npz' + dst_segm_path = os.path.join(images_path, dst_segm_file_name) + all_pos_data_info.append({ + 'path': dst_img_file_path, + 'SizeT': size_t, + 'SizeZ': size_z, + 'data': video_data, + 'frameNumbers': frame_numbers, + 'dst_segm_path': dst_segm_path, + 'src_segm_paths': src_segm_paths, + }) + + metadata_csv_path = os.path.join( + images_path, f'{pos_basename}metadata.csv' + ) + df_metadata = df_metadata.T + df_metadata.index.name = 'Description' + df_metadata.to_csv(metadata_csv_path) + logger_func('*' * 40) + + if not all_pos_data_info: + raise RuntimeError('No valid image files found to restructure.') + + logger_func('Saving image files...') + max_size_t = max(d['SizeT'] for d in all_pos_data_info) + min_frame_number = min(d['frameNumbers'][0] for d in all_pos_data_info) + for img_data_info in all_pos_data_info: + video_data = img_data_info['data'] + frame_numbers = img_data_info['frameNumbers'] + padded_shape = (max_size_t, *video_data.shape[1:]) + 
padded_video_data = np.zeros(padded_shape, dtype=video_data.dtype) + for frame_number, img in zip(frame_numbers, video_data): + frame_i = frame_number - min_frame_number + padded_video_data[frame_i] = img + img_data_info['paddedShape'] = padded_shape + img_data_info['data'] = None + myutils.to_tiff(img_data_info['path'], padded_video_data) + + if not segm_folder: + logger_func(f'Restructure completed. Output saved to "{dst_path}".') + return + + logger_func('Saving segmentation files...') + for img_data_info in all_pos_data_info: + padded_shape = img_data_info['paddedShape'] + segm_data = np.zeros(padded_shape, dtype=np.uint32) + for frame_number, segm_file_path in zip( + img_data_info['frameNumbers'], img_data_info['src_segm_paths'] + ): + frame_i = frame_number - min_frame_number + try: + lab = load.imread(segm_file_path).astype(np.uint32) + segm_data[frame_i] = lab + except Exception: + logger_func( + 'WARNING: Segmentation file does not exist, saving empty ' + f'masks: "{segm_file_path}"' + ) + io.savez_compressed(img_data_info['dst_segm_path'], segm_data) + + logger_func(f'Restructure completed. Output saved to "{dst_path}".') + + +def run_restructure_cli(args: dict, logger_func=print): + src_path = os.path.abspath(args['input']) + dst_path = os.path.abspath(args['output']) + layout = args['layout'] + + if not os.path.isdir(src_path): + raise NotADirectoryError( + f'Input path must be a folder containing image files: "{src_path}"' + ) + os.makedirs(dst_path, exist_ok=True) + + if layout == 'multi-channel': + restructure_multi_channel( + src_path, dst_path, + action=args.get('action', 'copy'), + logger_func=logger_func, + ) + elif layout == 'multi-timepoint': + channels_arg = args.get('channels') + if not channels_arg: + raise ValueError( + '--channels is required for layout "multi-timepoint".' 
+ ) + channels = [ + ch.strip() for ch in channels_arg.split(',') if ch.strip() + ] + restructure_multi_timepoint( + src_path, dst_path, + channels=channels, + basename=args.get('basename') or '', + segm_folder=args.get('segm_folder') or '', + logger_func=logger_func, + ) + else: + valid = ', '.join(RESTRUCTURE_LAYOUTS) + raise ValueError( + f'Invalid restructure layout "{layout}". Valid values: {valid}' + ) + + +def _add_metadata_override_args(parser): + parser.add_argument( + '--trust-metadata', action=argparse.BooleanOptionalAction, + default=True, + help='Trust metadata read from the file (default: true for CLI).', + ) + parser.add_argument('--lens-na', type=float, default=None) + parser.add_argument('--size-t', type=int, default=None) + parser.add_argument('--size-z', type=int, default=None) + parser.add_argument('--size-c', type=int, default=None) + parser.add_argument('--size-s', type=int, default=None) + parser.add_argument('--time-increment', type=float, default=None) + parser.add_argument('--physical-size-x', type=float, default=None) + parser.add_argument('--physical-size-y', type=float, default=None) + parser.add_argument('--physical-size-z', type=float, default=None) + parser.add_argument( + '--channels', type=str, default=None, + help='Comma-separated channel names.', + ) + parser.add_argument( + '--em-wavelens', type=str, default=None, + help='Comma-separated emission wavelengths.', + ) + parser.add_argument( + '--positions', type=str, default=None, + help='Comma-separated positions to save (e.g. Position_1,Position_3). ' + 'Default: all positions.', + ) + parser.add_argument( + '--time-range', type=str, default=None, + help='Time range to save as start:end (0-indexed, inclusive).', + ) + parser.add_argument( + '--save-channels', type=str, default=None, + help='Comma-separated booleans for channels to save (e.g. 
true,false,true).', + ) + parser.add_argument( + '--metadata-csv', type=str, default=None, + help='CSV file with metadata overrides (Description,values format).', + ) + parser.add_argument( + '--add-image-name', action='store_true', + help='Include image name in output filenames.', + ) + + +def build_data_parser(): + data_parser = argparse.ArgumentParser( + prog='acdc-data', + description='Headless data structure tools for Cell-ACDC (BioIO only).', + formatter_class=argparse.RawTextHelpFormatter, + ) + subparsers = data_parser.add_subparsers(dest='command', required=True) + + metadata_parser = subparsers.add_parser( + 'metadata', + help='Extract metadata from a raw microscopy file.', + formatter_class=argparse.RawTextHelpFormatter, + ) + metadata_parser.add_argument( + '--input', '-i', required=True, type=str, + help='Path to a raw microscopy file or folder containing one file.', + ) + metadata_parser.add_argument( + '--output', '-o', type=str, default=None, + help='Output folder for metadata.csv and metadataXML.txt.', + ) + metadata_parser.add_argument( + '--format', choices=('text', 'json', 'csv'), default='text', + help='Output format (default: text).', + ) + metadata_parser.add_argument( + '--basename', type=str, default=None, + help='Basename for metadata.csv (default: derived from filename).', + ) + + convert_parser = subparsers.add_parser( + 'convert', + help='Convert raw microscopy files to the ACDC data structure.', + formatter_class=argparse.RawTextHelpFormatter, + ) + convert_parser.add_argument( + '--input', '-i', required=True, type=str, + help='Folder containing raw microscopy file(s).', + ) + convert_parser.add_argument( + '--output', '-o', required=True, type=str, + help='Experiment destination folder.', + ) + convert_parser.add_argument( + '--layout', required=True, + choices=tuple(LAYOUT_TO_RAW_DATA_STRUCT), + help='How raw files are arranged.', + ) + convert_parser.add_argument( + '--format', choices=('tif', 'h5'), default='tif', + help='Output 
image format (default: tif).', + ) + convert_parser.add_argument( + '--lazy-load', action=argparse.BooleanOptionalAction, default=True, + help='Load one frame at a time to reduce RAM usage (default: true).', + ) + convert_parser.add_argument( + '--move-raw', action=argparse.BooleanOptionalAction, default=True, + help='Move raw files to raw_microscopy_files/ after conversion.', + ) + convert_parser.add_argument( + '--on-existing', choices=('overwrite', 'add', 'create-new'), + default='overwrite', + help='Policy when destination already has Position folders.', + ) + convert_parser.add_argument( + '--basename', type=str, default=None, + help='Required for layout one-per-channel.', + ) + _add_metadata_override_args(convert_parser) + + restructure_parser = subparsers.add_parser( + 'restructure', + help='Restructure pre-processed image files into the ACDC folder layout.', + formatter_class=argparse.RawTextHelpFormatter, + ) + restructure_parser.add_argument( + '--input', '-i', required=True, type=str, + help='Folder containing pre-processed image files.', + ) + restructure_parser.add_argument( + '--output', '-o', required=True, type=str, + help='Destination experiment folder.', + ) + restructure_parser.add_argument( + '--layout', required=True, choices=RESTRUCTURE_LAYOUTS, + help=( + 'How files are arranged: multi-timepoint (one file per frame, ' + 'stack into TIFFs) or multi-channel (one file per channel, ' + 'organize into Position folders).' + ), + ) + restructure_parser.add_argument( + '--action', choices=('copy', 'move'), default='copy', + help='For multi-channel layout: copy or move files (default: copy).', + ) + restructure_parser.add_argument( + '--channels', type=str, default=None, + help=( + 'Required for multi-timepoint layout. Comma-separated channel ' + 'names matching filenames (e.g. pos1_GFP_1.tif → GFP).' 
def run():
    """Console entry point for ``acdc-data``.

    Parses ``sys.argv`` and forwards the resulting argument dict to the
    runner in ``cellacdc._run`` that matches the chosen sub-command.

    Raises
    ------
    ValueError
        If the parsed command is not one of the known sub-commands.
    """
    from cellacdc import _run

    parser_args = parse_data_cli_args()
    command = parser_args['command']

    # Sub-command name -> name of the runner function in ``_run``.
    runner_names = {
        'metadata': 'run_data_metadata',
        'convert': 'run_data_convert',
        'restructure': 'run_data_restructure',
    }
    if command not in runner_names:
        raise ValueError(f'Unknown command "{command}"')

    runner = getattr(_run, runner_names[command])
    runner(parser_args)
+ +Extract metadata from a raw microscopy file:: + + acdc-data metadata --input /path/to/file.czi + acdc-data metadata --input /path/to/file.czi --format json + acdc-data metadata --input /path/to/file.czi --output /path/to/Position_1/Images/ + +Convert raw microscopy files to the ACDC folder structure:: + + acdc-data convert \\ + --input /path/to/raw_files/ \\ + --output /path/to/experiment/ \\ + --layout one-per-pos \\ + --trust-metadata + +Layout options for ``--layout``: + +* ``single-multi-pos`` — single file containing multiple positions/series +* ``one-per-pos`` — one microscopy file per position +* ``one-per-channel`` — one file per channel (pattern ``basenameN_channelName``) + +Restructure pre-processed image files (GUI "Data Re-Struct" utility):: + + acdc-data restructure \\ + --input /path/to/image_files/ \\ + --output /path/to/experiment/ \\ + --layout multi-timepoint \\ + --channels GFP,mCherry + + acdc-data restructure \\ + --input /path/to/image_files/ \\ + --output /path/to/experiment/ \\ + --layout multi-channel \\ + --action move + +Restructure layout options: + +* ``multi-timepoint`` — one file per time-point (e.g. ``pos1_GFP_1.tif``), stacked into channel TIFFs +* ``multi-channel`` — flat channel files grouped into ``Position_n/Images/`` + +Additional useful flags for ``acdc-data convert``: + +* ``--format tif|h5`` — output image format (default: tif) +* ``--lazy-load`` / ``--no-lazy-load`` — load one frame at a time (default: lazy load) +* ``--on-existing overwrite|add|create-new`` — policy when destination has Position folders +* ``--channels "ch1,ch2"`` — override channel names +* ``--metadata-csv /path/to/overrides.csv`` — apply metadata overrides + +For interactive metadata review and visual confirmation, use GUI Module 0 instead. + The Main Menu ------------- The main menu is a **hub** through which you can access all relevant modules. 
diff --git a/pyproject.toml b/pyproject.toml index e96d7a741..4fd8d1036 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,6 +142,7 @@ dev = [ cellacdc = "cellacdc.__main__:run" acdc = "cellacdc.__main__:run" Cell-ACDC = "cellacdc.__main__:run" +acdc-data = "cellacdc.data_cli:run" [tool.setuptools] include-package-data = true diff --git a/tests/test_data_cli.py b/tests/test_data_cli.py new file mode 100644 index 000000000..09e4bc116 --- /dev/null +++ b/tests/test_data_cli.py @@ -0,0 +1,150 @@ +import pytest + +from cellacdc.data_cli import ( + HeadlessMetadataHandler, + ParsedRawMetadata, + RESTRUCTURE_LAYOUTS, + build_convert_state, + build_data_parser, + get_start_pos_n, + guess_basename_from_filepath, + layout_to_raw_data_struct, + metadata_to_dataframe, + on_existing_to_worker_flags, + parse_one_per_channel_files, + parse_time_range, + read_filename_pattern, +) + + +def test_build_data_parser_prog(): + assert build_data_parser().prog == 'acdc-data' + + +def test_layout_to_raw_data_struct(): + assert layout_to_raw_data_struct('single-multi-pos') == 0 + assert layout_to_raw_data_struct('one-per-pos') == 1 + assert layout_to_raw_data_struct('one-per-channel') == 2 + with pytest.raises(ValueError): + layout_to_raw_data_struct('invalid-layout') + + +def test_parse_time_range(): + assert parse_time_range(None) == (0, None) + assert parse_time_range('0:99') == (0, 99) + assert parse_time_range('120') == (0, 120) + + +def test_on_existing_to_worker_flags(): + assert on_existing_to_worker_flags('overwrite') == { + 'overwrite': True, 'add_files': False, 'create_new': False, + } + assert on_existing_to_worker_flags('add') == { + 'overwrite': False, 'add_files': True, 'create_new': False, + } + assert on_existing_to_worker_flags('create-new') == { + 'overwrite': False, 'add_files': False, 'create_new': True, + } + + +def test_parse_one_per_channel_files(): + files = ['ASY015_1_GFP.tif', 'ASY015_1_mNeon.tif', 'ASY015_2_GFP.tif'] + basename, pos_nums, ch_names = 
parse_one_per_channel_files(files) + assert basename == 'ASY015_' + assert pos_nums == [1, 2] + assert set(ch_names) == {'GFP', 'mNeon'} + + +def test_headless_metadata_handler_from_args(): + handler = HeadlessMetadataHandler.from_parser_args({ + 'trust_metadata': True, + 'format': 'h5', + 'channels': 'phase_contrast,GFP', + 'time_range': '0:10', + 'lens_na': 1.4, + }) + assert handler.to_h5 is True + assert handler.channels == ['phase_contrast', 'GFP'] + assert handler.time_range_end == 10 + assert handler.lens_na == 1.4 + + +def test_metadata_to_dataframe(): + parsed = ParsedRawMetadata( + lens_na=1.4, + size_t=10, + size_z=1, + channel_names=['phase_contrast', 'GFP'], + em_wavelens=[500.0, 525.0], + ) + df = metadata_to_dataframe(parsed, 'test_exp_') + assert df.at['basename', 'values'] == 'test_exp_' + assert df.at['SizeT', 'values'] == 10 + assert df.at['channel_0_name', 'values'] == 'phase_contrast' + assert df.at['channel_1_emWavelen', 'values'] == 525.0 + + +def test_guess_basename_from_filepath(): + assert guess_basename_from_filepath('/path/to/Example1.czi') == 'Example1_' + + +def test_metadata_parser_accepts_required_args(): + args = build_data_parser().parse_args([ + 'metadata', '--input', '/tmp/file.czi', + ]) + assert args.command == 'metadata' + assert args.input == '/tmp/file.czi' + + +def test_convert_parser_accepts_required_args(): + args = build_data_parser().parse_args([ + 'convert', + '--input', '/tmp/raw', + '--output', '/tmp/exp', + '--layout', 'one-per-pos', + ]) + assert args.command == 'convert' + assert args.layout == 'one-per-pos' + assert args.trust_metadata is True + + +def test_restructure_parser_accepts_required_args(): + args = build_data_parser().parse_args([ + 'restructure', + '--input', '/tmp/raw', + '--output', '/tmp/exp', + '--layout', 'multi-timepoint', + '--channels', 'GFP,mCherry', + ]) + assert args.command == 'restructure' + assert args.layout in RESTRUCTURE_LAYOUTS + assert args.channels == 'GFP,mCherry' + + +def 
test_read_filename_pattern(): + pos, frame, ch = read_filename_pattern('pos1_GFP_01.tif') + assert pos == 'pos1' + assert frame == '01' + assert ch == 'GFP' + + +def test_build_convert_state(): + parsed = ParsedRawMetadata( + lens_na=1.4, + size_t=10, + size_z=1, + channel_names=['phase_contrast'], + em_wavelens=[500.0], + ) + handler = HeadlessMetadataHandler(channels=['GFP'], to_h5=True) + state = build_convert_state(parsed, handler) + assert state.channel_names == ['GFP'] + assert state.to_h5 is True + assert state.size_c == 1 + + +def test_get_start_pos_n(tmp_path): + assert get_start_pos_n(str(tmp_path), 'overwrite') == 1 + (tmp_path / 'Position_1').mkdir() + (tmp_path / 'Position_3').mkdir() + assert get_start_pos_n(str(tmp_path), 'create-new') == 4 From faba29251ef11c50e347d2576c7198c7f4078fca Mon Sep 17 00:00:00 2001 From: keejkrej Date: Mon, 25 May 2026 17:17:11 +0200 Subject: [PATCH 2/3] Fix TimeIncrement loss in bioio metadata round-trip. Preserve time interval when metadata is serialized to disk and prevent CZI fallback mappers from overwriting parsed values with None. 
Co-authored-by: Cursor --- cellacdc/acdc_bioio_bioformats/reader.py | 49 +++++++++++++++++++++--- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/cellacdc/acdc_bioio_bioformats/reader.py b/cellacdc/acdc_bioio_bioformats/reader.py index 03a823bd8..d9523c5cf 100644 --- a/cellacdc/acdc_bioio_bioformats/reader.py +++ b/cellacdc/acdc_bioio_bioformats/reader.py @@ -109,13 +109,21 @@ class Channel: class Node: def __init__(self, image_filepath, bioimage_class): _, ext = os.path.splitext(image_filepath) + self._node = {} try: self._node = { 'TimeIncrement': bioimage_class.time_interval.total_seconds(), 'TimeIncrementUnit': 's' } - except Exception as err: - self._node = {} + except Exception: + time_increment = getattr(bioimage_class, 'time_increment', None) + if time_increment is not None: + self._node = { + 'TimeIncrement': time_increment, + 'TimeIncrementUnit': getattr( + bioimage_class, 'time_increment_unit', 's' + ), + } if ext not in EXTENSION_METADATA_ATTR_MAPPER: return @@ -123,9 +131,11 @@ def __init__(self, image_filepath, bioimage_class): name_expression_mapper = EXTENSION_METADATA_ATTR_MAPPER[ext] for name, expression in name_expression_mapper.items(): try: - self._node[name] = safe_get_or_call(bioimage_class, expression) + value = safe_get_or_call(bioimage_class, expression) + if value is not None: + self._node[name] = value except Exception as err: - self._node[name] = None + pass def get(self, name): value = self._node.get(name) @@ -153,7 +163,8 @@ class BioImageMetadata: def __init__( self, SizeT, SizeC, SizeZ, SizeY, SizeX, PhysicalSizeX, PhysicalSizeY, PhysicalSizeZ, - channel_names, image_count + channel_names, image_count, + time_increment=None, time_increment_unit='s', ): self.shape = (SizeT, SizeC, SizeZ, SizeY, SizeX) self.physical_pixel_sizes = PhysicalPixelSizes( @@ -161,6 +172,8 @@ def __init__( ) self.channel_names = channel_names self.scenes = list(range(image_count)) + self.time_increment = time_increment + 
self.time_increment_unit = time_increment_unit class OMEXML: def __init__(self): @@ -200,6 +213,16 @@ def __str__(self): f'PhysicalSizeZ: {self.bioimage.physical_pixel_sizes.Z}\n' f'Image count: {self.get_image_count()}' ) + try: + time_increment = self.Pixels.node.get('TimeIncrement') + time_increment_unit = self.Pixels.node.get('TimeIncrementUnit') + txt = ( + f'{txt}\n' + f'TimeIncrement: {time_increment}\n' + f'TimeIncrementUnit: {time_increment_unit}' + ) + except Exception: + pass return txt def to_file(self, filepath): @@ -230,11 +253,25 @@ def init_from_file(self, filepath, image_filepath): setattr(self, kwarg, dtype(value)) except Exception as err: setattr(self, kwarg, default) + + time_increment = None + time_increment_unit = 's' + time_increment_match = re.search(r'TimeIncrement: (.+)', txt) + if time_increment_match is not None: + try: + time_increment = float(time_increment_match.group(1)) + except Exception: + pass + time_increment_unit_match = re.search(r'TimeIncrementUnit: (.+)', txt) + if time_increment_unit_match is not None: + time_increment_unit = time_increment_unit_match.group(1) self.bioimage = BioImageMetadata( self.SizeT, self.SizeC, self.SizeZ, self.SizeY, self.SizeX, self.PhysicalSizeX, self.PhysicalSizeY, self.PhysicalSizeZ, - self.channel_names, self.image_count + self.channel_names, self.image_count, + time_increment=time_increment, + time_increment_unit=time_increment_unit, ) self._init_Pixels(image_filepath) From 10be9468d31c911ab45ba1fb9ac5afc0c079c14b Mon Sep 17 00:00:00 2001 From: keejkrej Date: Mon, 25 May 2026 18:53:57 +0200 Subject: [PATCH 3/3] Read channel emission and excitation wavelengths from bioio ND2/CZI readers. Expose OME-style Channel node metadata so ACDC metadata parsing picks up per-channel wavelengths from patched bioio readers. 
Co-authored-by: Cursor --- cellacdc/acdc_bioio_bioformats/reader.py | 48 +++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/cellacdc/acdc_bioio_bioformats/reader.py b/cellacdc/acdc_bioio_bioformats/reader.py index d9523c5cf..fd9ddd35a 100644 --- a/cellacdc/acdc_bioio_bioformats/reader.py +++ b/cellacdc/acdc_bioio_bioformats/reader.py @@ -106,6 +106,24 @@ def __str__(self): class Channel: pass +def _get_reader_channel_wavelengths(bioimage): + reader = getattr(bioimage, 'reader', None) + if reader is None: + return None, None + emission = getattr(reader, 'channel_emission_wavelengths', None) + excitation = getattr(reader, 'channel_excitation_wavelengths', None) + return emission, excitation + +def _read_channel_wavelengths_from_filepath(image_filepath): + from bioio import BioImage + + try: + kwargs = set_reader(image_filepath) + bioimage = BioImage(image_filepath, **kwargs) + return _get_reader_channel_wavelengths(bioimage) + except Exception: + return None, None + class Node: def __init__(self, image_filepath, bioimage_class): _, ext = os.path.splitext(image_filepath) @@ -144,10 +162,36 @@ def get(self, name): return value -class Pixels: +class Pixels: + def _get_channel_wavelengths(self): + if hasattr(self, '_channel_wavelength_cache'): + return self._channel_wavelength_cache + emission, excitation = _get_reader_channel_wavelengths( + getattr(self, 'bioimage', None) + ) + if emission is None: + image_filepath = getattr(self, 'image_filepath', None) + if image_filepath is not None: + emission, excitation = _read_channel_wavelengths_from_filepath( + image_filepath + ) + self._channel_wavelength_cache = (emission, excitation) + return self._channel_wavelength_cache + def Channel(self, c: int): channel = Channel() channel.Name = self.channel_names[c] + emission, excitation = self._get_channel_wavelengths() + node = {} + if emission is not None and c < len(emission): + em_wavelength = emission[c] + if em_wavelength is not None: + 
node['EmissionWavelength'] = str(em_wavelength) + if excitation is not None and c < len(excitation): + ex_wavelength = excitation[c] + if ex_wavelength is not None: + node['ExcitationWavelength'] = str(ex_wavelength) + channel.node = node return channel def get_omexml_metadata(image_filepath, qparent=None): @@ -195,6 +239,8 @@ def init_from_metadata(self, metadata: Metadata): def _init_Pixels(self, image_filepath): self.Pixels = Pixels() + self.Pixels.bioimage = self.bioimage + self.Pixels.image_filepath = image_filepath self.Pixels.node = Node(image_filepath, self.bioimage) self.Pixels.channel_names = self.bioimage.channel_names