Publication date: Sep 19, 2024
Confidentiality: Public
Step 1: Load Data Task

The Load Data Task is the most critical component: it reads your data files and converts them into a standardized ResourceSet that the Cell Culture framework can process.


Required Output Structure


Your Load Data Task must produce multiple outputs:


  1. resource_set (ResourceSet): Main output containing one table per batch-sample pair.
     - Each table represents data for a specific batch-sample combination (e.g., "B001_S1", "B001_S2").
     - Each table combines:
       - Raw data (if available): initial time-series measurements
       - Follow-up data (if available): additional time-series measurements
     - Each table must have:
       - A batch identifier column (e.g., "ESSAI", "Batch")
       - A sample identifier column (e.g., "FERMENTEUR", "Sample")
       - A time column (e.g., "Temps de culture (h)", "Time (h)")
       - Measurement columns (OD, pH, glucose, etc.)
     - Each table must be tagged with:
       - Tag('batch', batch_value), e.g., Tag('batch', 'B001')
       - Tag('sample', sample_value), e.g., Tag('sample', 'S1')
       - Tag('medium', medium_name): optional, with the medium composition in additional_info
       - Tag('missing_value', 'info, raw_data, follow_up'): optional, lists the missing data types
  2. venn_diagram (PlotlyResource, optional): Visualization showing data availability across the different sources
  3. medium_table (Table, optional): Processed medium composition table with numeric columns converted to float
  4. metadata_table (Table, optional): Additional metadata table output (can be the same as in the ResourceSet)

Example Output Specs:


          output_specs = OutputSpecs({
              'resource_set': OutputSpec(
                  ResourceSet, 
                  human_name="Resource set containing all the tables"
              ),
              'venn_diagram': OutputSpec(
                  PlotlyResource, 
                  human_name="Venn diagram of data availability", 
                  optional=True
              ),
              'medium_table': OutputSpec(
                  Table, 
                  human_name="Medium composition table",
                  short_description="Medium CSV file with numeric columns properly converted to float",
                  optional=True
              ),
              'metadata_table': OutputSpec(
                  Table, 
                  human_name="Metadata table",
                  short_description="Table containing metadata information",
                  optional=True
              )
          })

Example: Custom Load Data Task


          import pandas as pd
          import numpy as np
          import plotly.graph_objects as go

          from gws_core import (
              Task, task_decorator, InputSpec, OutputSpec,
              InputSpecs, OutputSpecs, Table, ResourceSet, File,
              Folder, Tag, PlotlyResource
          )
          
          @task_decorator(
              "MyCustomLoadData",
              human_name="Load My Custom Data",
              short_description="Load custom fermentation data"
          )
          class MyCustomLoadData(Task):
              """
              Load custom fermentation data from files.
              """
              
              # Define input files
              input_specs = InputSpecs({
                  'info_csv': InputSpec(File, human_name="Info CSV file"),
                  'medium_csv': InputSpec(File, human_name="Medium composition CSV"),
                  'raw_data_folder': InputSpec(Folder, human_name="Raw data folder"),
                  'follow_up_folder': InputSpec(Folder, human_name="Follow-up data folder")
              })
              
              # Define multiple outputs
              output_specs = OutputSpecs({
                  'resource_set': OutputSpec(
                      ResourceSet, 
                      human_name="Resource set containing all the tables"
                  ),
                  'venn_diagram': OutputSpec(
                      PlotlyResource, 
                      human_name="Data availability visualization", 
                      optional=True
                  ),
                  'medium_table': OutputSpec(
                      Table, 
                      human_name="Medium composition table",
                      optional=True
                  ),
                  'metadata_table': OutputSpec(
                      Table, 
                      human_name="Metadata table",
                      optional=True
                  )
              })
              
              def run(self, params, inputs):
                  # 1. Read your input files
                  info_df = pd.read_csv(inputs['info_csv'].path)
                  medium_df = pd.read_csv(inputs['medium_csv'].path)
                  
                  # 2. Process raw data files (initial measurements)
                  raw_data_dict = {}  # Key: (batch, sample), Value: DataFrame
                  for file in inputs['raw_data_folder'].list_files():
                      df = pd.read_csv(file)
                      # Extract batch/sample from filename or data
                      batch = df['Batch'].iloc[0]
                      sample = df['Sample'].iloc[0]
                      raw_data_dict[(batch, sample)] = df
                  
                  # 3. Process follow-up data files (additional measurements)
                  follow_up_dict = {}  # Key: (batch, sample), Value: DataFrame
                  for file in inputs['follow_up_folder'].list_files():
                      df = pd.read_csv(file)
                      batch = df['Batch'].iloc[0]
                      sample = df['Sample'].iloc[0]
                      follow_up_dict[(batch, sample)] = df
                  
                  # 4. Create ResourceSet to hold all batch-sample tables
                  resource_set = ResourceSet()
                  
                  # 5. For each batch-sample pair, create a merged table
                  batch_sample_pairs = set(raw_data_dict) | set(follow_up_dict)
                  
                  for batch, sample in batch_sample_pairs:
                      # Get raw data (if available)
                      raw_df = raw_data_dict.get((batch, sample), pd.DataFrame())
                      
                      # Get follow-up data (if available)
                      follow_up_df = follow_up_dict.get((batch, sample), pd.DataFrame())
                      
                      # Merge raw and follow-up data
                      if not raw_df.empty and not follow_up_df.empty:
                          # Concatenate along time axis
                          merged_df = pd.concat([raw_df, follow_up_df], ignore_index=True)
                          merged_df = merged_df.sort_values('Time (h)')
                      elif not raw_df.empty:
                          merged_df = raw_df
                      elif not follow_up_df.empty:
                          merged_df = follow_up_df
                      else:
                          continue  # Skip if no data
                      
                      # Create Table for this batch-sample pair
                      table = Table(merged_df)
                      table.name = f"{batch}_{sample}"
                      
                      # Add tags to the table
                      table.add_tag(Tag('batch', batch))
                      table.add_tag(Tag('sample', sample))
                      
                      # Track missing data types
                      missing_values = []
                      if not ((info_df['Batch'] == batch) & (info_df['Sample'] == sample)).any():
                          missing_values.append('info')
                      if (batch, sample) not in raw_data_dict:
                          missing_values.append('raw_data')
                      if (batch, sample) not in follow_up_dict:
                          missing_values.append('follow_up')
                      elif follow_up_dict[(batch, sample)].empty:
                          missing_values.append('follow_up_empty')
                      
                      if missing_values:
                          table.add_tag(Tag('missing_value', ', '.join(missing_values)))
                      
                      # Add medium information from info_df
                      info_row = info_df[(info_df['Batch'] == batch) & (info_df['Sample'] == sample)]
                      if not info_row.empty:
                          medium_name = info_row['Medium'].iloc[0]
                          if pd.notna(medium_name):  # Check medium is not NaN
                              # Get medium composition from medium_df
                              medium_row = medium_df[medium_df['MILIEU'] == medium_name]
                              if not medium_row.empty:
                                  # Convert medium row to dict (excluding MILIEU column)
                                  medium_composition = medium_row.iloc[0].to_dict()
                                  # Remove the MILIEU key from the dict
                                  medium_composition.pop('MILIEU', None)
                                  # Add tag with medium name and composition in additional_info
                                  table.add_tag(Tag('medium', medium_name, additional_info={'composed': medium_composition}))
                      
                      # Add column tags (is_index_column, is_data_column)
                      for col in table.column_names:
                          if col == 'Time (h)':
                              table.add_column_tag_by_name(col, 'is_index_column', 'true')
                          elif col not in ['Batch', 'Sample', 'Medium']:
                              table.add_column_tag_by_name(col, 'is_data_column', 'true')
                      
                      # Add table to ResourceSet
                      resource_set.add_resource(table)
                  
                  # 6. Create metadata table for ML analyses
                  metadata_table = self._create_metadata_table(info_df, medium_df, follow_up_dict)
                  
                  # 7. Optional: Create visualization (Venn diagram)
                  venn_diagram = self._create_venn_diagram(batch_sample_pairs, raw_data_dict, follow_up_dict)
                  
                  # 8. Optional: Process medium table
                  medium_table_processed = self._process_medium_table(medium_df)
                  
                  # 9. Return all outputs (required + optional)
                  return {
                      'resource_set': resource_set,
                      'venn_diagram': venn_diagram,
                      'medium_table': medium_table_processed,
                      'metadata_table': metadata_table
                  }
              
              def _create_metadata_table(self, info_df, medium_df, follow_up_dict):
                  """
                  Create a metadata table for ML analyses.
                  Each row represents a batch-sample pair with medium composition and follow-up medians.
                  """
                  metadata_rows = []
                  
                  for _, row in info_df.iterrows():
                      batch = row['Batch']
                      sample = row['Sample']
                      medium = row['Medium']
                      
                      # Create row with Series identifier
                      metadata_row = {
                          'Series': f"{batch}_{sample}",
                          'Medium': medium
                      }
                      
                      # Add medium composition
                      medium_row = medium_df[medium_df['MILIEU'] == medium]
                      if not medium_row.empty:
                          for col in medium_row.columns:
                              if col != 'MILIEU':
                                  metadata_row[col] = medium_row[col].iloc[0]
                      
                      # Add follow-up data medians (if available)
                      if (batch, sample) in follow_up_dict:
                          follow_up_df = follow_up_dict[(batch, sample)]
                          for col in follow_up_df.select_dtypes(include=[np.number]).columns:
                              if col not in ['Batch', 'Sample', 'Time (h)']:
                                  metadata_row[f"{col}_median"] = follow_up_df[col].median()
                      
                      metadata_rows.append(metadata_row)
                  
                  return Table(pd.DataFrame(metadata_rows))
              
              def _create_venn_diagram(self, batch_sample_pairs, raw_data_dict, follow_up_dict):
                  """Create a Venn diagram showing data availability across sources."""
                  sample_sets = {
                      'info': batch_sample_pairs,
                      'raw_data': set(raw_data_dict.keys()),
                      'follow_up': set(k for k, v in follow_up_dict.items() if not v.empty)
                  }
                  # Your Venn diagram creation logic
                  fig = go.Figure()
                  # ... create Venn diagram with sample_sets
                  return PlotlyResource(fig)
              
              def _process_medium_table(self, medium_df):
                  """Process medium table (convert to float, etc.)."""
                  # Convert numeric columns to float
                  for col in medium_df.select_dtypes(include=['object']).columns:
                      if col != 'MILIEU':  # Skip medium name column
                          medium_df[col] = pd.to_numeric(
                              medium_df[col].str.replace(',', '.'), 
                              errors='coerce'
                          ).fillna(0)
                  return Table(medium_df)
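The merge rule in step 5 (concatenate raw and follow-up measurements, then sort by time) can be exercised outside the framework with plain pandas. The column names and values below are illustrative, not taken from a real dataset:

```python
import pandas as pd

# Hypothetical raw and follow-up measurements for one batch-sample pair
raw_df = pd.DataFrame({'Time (h)': [0.0, 2.0, 4.0], 'OD600': [0.1, 0.4, 0.9]})
follow_up_df = pd.DataFrame({'Time (h)': [6.0, 8.0], 'OD600': [1.6, 2.1]})

# Same logic as step 5: concatenate along the time axis, then sort by time
merged_df = pd.concat([raw_df, follow_up_df], ignore_index=True)
merged_df = merged_df.sort_values('Time (h)').reset_index(drop=True)

print(merged_df['Time (h)'].tolist())  # [0.0, 2.0, 4.0, 6.0, 8.0]
```

Because `ignore_index=True` discards the original row labels, the subsequent `sort_values` plus `reset_index` leaves a clean, time-ordered table regardless of which source each row came from.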

Critical Tags to Add

Your Load Data Task must add the following tags so the framework can process the data:


On Each Batch-Sample Table (in the ResourceSet):

• Tag('batch', batch_value): the batch identifier (e.g., 'B001', 'ESSAI01')
• Tag('sample', sample_value): the sample identifier (e.g., 'S1', 'FERMENTEUR1')
• Tag('medium', medium_name): optional, the medium name with its composition in additional_info
• Tag('missing_value', 'info, raw_data, follow_up'): optional, a comma-separated list of missing data types

Column Tags on Each Table:

• Time column: add_column_tag_by_name('Time (h)', 'is_index_column', 'true')
• Data columns: add_column_tag_by_name('OD600', 'is_data_column', 'true') (repeat for each measurement column)
• Skip the batch, sample, and medium columns (these are metadata, not data)

Example:

          table = Table(merged_df)
          table.name = f"{batch}_{sample}"

          # Table-level tags
          table.add_tag(Tag('batch', batch))
          table.add_tag(Tag('sample', sample))
          table.add_tag(Tag('medium', 'Medium1', additional_info={'composed': {'Glucose': 10.0, 'Nitrogen': 2.0}}))
          table.add_tag(Tag('missing_value', 'raw_data'))  # If raw data is missing

          # Column-level tags
          table.add_column_tag_by_name('Time (h)', 'is_index_column', 'true')
          table.add_column_tag_by_name('OD600', 'is_data_column', 'true')
          table.add_column_tag_by_name('pH', 'is_data_column', 'true')
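The numeric cleanup performed by _process_medium_table (decimal commas converted to dots, non-numeric entries coerced to 0) can also be checked in isolation with plain pandas. The medium names and column values here are made up for illustration:

```python
import pandas as pd

# Hypothetical medium table using French-style decimal commas, stored as text
medium_df = pd.DataFrame({
    'MILIEU': ['Medium1', 'Medium2'],
    'Glucose': ['10,5', '8,0'],
    'Nitrogen': ['2,0', 'n/a'],  # non-numeric entries end up as 0
})

# Same rule as _process_medium_table: ',' -> '.', coerce to numeric, NaN -> 0
for col in medium_df.select_dtypes(include=['object']).columns:
    if col != 'MILIEU':  # skip the medium name column
        medium_df[col] = pd.to_numeric(
            medium_df[col].str.replace(',', '.'),
            errors='coerce'
        ).fillna(0)

print(medium_df['Glucose'].tolist())   # [10.5, 8.0]
print(medium_df['Nitrogen'].tolist())  # [2.0, 0.0]
```

Note that `errors='coerce'` silently turns unparseable entries into NaN, which `fillna(0)` then maps to 0; if a missing concentration should not be treated as zero in your analyses, adapt this rule accordingly.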