The Load Data Task is the most critical component - it reads your data files and converts them into a standardized ResourceSet that the Cell Culture framework can process.
Required Output Structure
Your Load Data Task must produce multiple outputs:
- resource_set (ResourceSet): Main output containing one table per batch-sample pair. Each table represents data for a specific batch-sample combination (e.g., "B001_S1", "B001_S2") and combines:
  - Raw data (if available): initial time-series measurements
  - Follow-up data (if available): additional time-series measurements

  Each table must have:
  - a batch identifier column (e.g., "ESSAI", "Batch")
  - a sample identifier column (e.g., "FERMENTEUR", "Sample")
  - a time column (e.g., "Temps de culture (h)", "Time (h)")
  - measurement columns (OD, pH, glucose, etc.)

  Each table must be tagged with:
  - Tag('batch', batch_value) — e.g., Tag('batch', 'B001')
  - Tag('sample', sample_value) — e.g., Tag('sample', 'S1')
  - Tag('medium', medium_name) — optional, with the medium composition in additional_info
  - Tag('missing_value', 'info, raw_data, follow_up') — optional, lists the missing data types
- venn_diagram (PlotlyResource, optional): Visualization showing data availability across different sources
- medium_table (Table, optional): Processed medium composition table with numeric columns converted to float
- metadata_table (Table, optional): Additional metadata table output (can be the same table as in the ResourceSet)
Example Output Specs:
# One required output (the ResourceSet) plus three optional outputs.
output_specs = OutputSpecs({
    # Main output: one Table per batch-sample pair, tagged for the framework.
    'resource_set': OutputSpec(
        ResourceSet,
        human_name="Resource set containing all the tables"
    ),
    # Optional: Plotly figure showing which data sources cover which pairs.
    'venn_diagram': OutputSpec(
        PlotlyResource,
        human_name="Venn diagram of data availability",
        optional=True
    ),
    # Optional: medium composition with numeric columns cast to float.
    'medium_table': OutputSpec(
        Table,
        human_name="Medium composition table",
        short_description="Medium CSV file with numeric columns properly converted to float",
        optional=True
    ),
    # Optional: extra metadata table (may duplicate one in the ResourceSet).
    'metadata_table': OutputSpec(
        Table,
        human_name="Metadata table",
        short_description="Table containing metadata information",
        optional=True
    )
})
Example: Custom Load Data Task
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from gws_core import (
    File, Folder, InputSpec, InputSpecs, OutputSpec, OutputSpecs,
    PlotlyResource, ResourceSet, Table, Tag, Task, task_decorator
)
@task_decorator(
    "MyCustomLoadData",
    human_name="Load My Custom Data",
    short_description="Load custom fermentation data"
)
class MyCustomLoadData(Task):
    """Load custom fermentation data from files.

    Reads an info CSV, a medium-composition CSV and two folders of
    time-series CSVs (raw and follow-up measurements), then builds one
    Table per batch-sample pair inside a ResourceSet. Each table is
    tagged (batch / sample / medium / missing_value) so the Cell Culture
    framework can process it.
    """

    # NOTE: the input keys declared here must match the keys read in
    # run(). The original example declared a single 'timeseries_folder'
    # input while run() read 'raw_data_folder' and 'follow_up_folder';
    # the spec is fixed here to match the code.
    input_specs = InputSpecs({
        'info_csv': InputSpec(File, human_name="Info CSV file"),
        'medium_csv': InputSpec(File, human_name="Medium composition CSV"),
        'raw_data_folder': InputSpec(Folder, human_name="Raw time series data folder"),
        'follow_up_folder': InputSpec(Folder, human_name="Follow-up time series data folder")
    })

    # One required output plus three optional ones.
    output_specs = OutputSpecs({
        'resource_set': OutputSpec(
            ResourceSet,
            human_name="Resource set containing all the tables"
        ),
        'venn_diagram': OutputSpec(
            PlotlyResource,
            human_name="Data availability visualization",
            optional=True
        ),
        'medium_table': OutputSpec(
            Table,
            human_name="Medium composition table",
            optional=True
        ),
        'metadata_table': OutputSpec(
            Table,
            human_name="Metadata table",
            optional=True
        )
    })

    def run(self, params, inputs):
        """Load, merge and tag the data; return all four outputs."""
        # 1. Read the CSV inputs
        info_df = pd.read_csv(inputs['info_csv'].path)
        medium_df = pd.read_csv(inputs['medium_csv'].path)

        # 2-3. Read the time-series folders (one CSV per batch/sample)
        raw_data_dict = self._read_timeseries_folder(inputs['raw_data_folder'])
        follow_up_dict = self._read_timeseries_folder(inputs['follow_up_folder'])

        # Pairs present in the info file, built ONCE as a set of plain
        # tuples: O(1) membership tests below, and no tuple-vs-namedtuple
        # comparison subtleties (the original re-iterated itertuples()
        # for every pair).
        info_pairs = set(
            info_df[['Batch', 'Sample']].itertuples(index=False, name=None)
        )

        # 4-5. Build one merged, tagged table per batch-sample pair
        resource_set = ResourceSet()
        batch_sample_pairs = set(raw_data_dict) | set(follow_up_dict)
        for batch, sample in batch_sample_pairs:
            merged_df = self._merge_pair_data(
                raw_data_dict.get((batch, sample)),
                follow_up_dict.get((batch, sample))
            )
            if merged_df is None:
                continue  # no data at all for this pair

            table = Table(merged_df)
            table.name = f"{batch}_{sample}"
            # Required framework tags
            table.add_tag(Tag('batch', batch))
            table.add_tag(Tag('sample', sample))

            missing = self._missing_data_types(
                batch, sample, info_pairs, raw_data_dict, follow_up_dict
            )
            if missing:
                table.add_tag(Tag('missing_value', ', '.join(missing)))

            self._add_medium_tag(table, batch, sample, info_df, medium_df)
            self._tag_columns(table)
            resource_set.add_resource(table)

        # 6-8. Secondary outputs
        metadata_table = self._create_metadata_table(info_df, medium_df, follow_up_dict)
        venn_diagram = self._create_venn_diagram(batch_sample_pairs, raw_data_dict, follow_up_dict)
        medium_table_processed = self._process_medium_table(medium_df)

        # 9. Return every declared output (optional ones included)
        return {
            'resource_set': resource_set,
            'venn_diagram': venn_diagram,
            'medium_table': medium_table_processed,
            'metadata_table': metadata_table
        }

    @staticmethod
    def _read_timeseries_folder(folder):
        """Read every CSV in *folder*, keyed by (batch, sample).

        Batch/sample identifiers are taken from the 'Batch' and 'Sample'
        columns of each file (first row). Replaces the two duplicated
        loops of the original example.
        """
        data_dict = {}
        for file in folder.list_files():
            df = pd.read_csv(file)
            batch = df['Batch'].iloc[0]
            sample = df['Sample'].iloc[0]
            data_dict[(batch, sample)] = df
        return data_dict

    @staticmethod
    def _merge_pair_data(raw_df, follow_up_df):
        """Merge the raw and follow-up DataFrames of one pair.

        Returns the concatenation sorted by 'Time (h)' when both exist,
        the non-empty one when only one exists, or None when neither does.
        """
        raw_ok = raw_df is not None and not raw_df.empty
        follow_ok = follow_up_df is not None and not follow_up_df.empty
        if raw_ok and follow_ok:
            merged = pd.concat([raw_df, follow_up_df], ignore_index=True)
            return merged.sort_values('Time (h)')
        if raw_ok:
            return raw_df
        if follow_ok:
            return follow_up_df
        return None

    @staticmethod
    def _missing_data_types(batch, sample, info_pairs, raw_data_dict, follow_up_dict):
        """List which data sources are missing for this (batch, sample)."""
        missing = []
        if (batch, sample) not in info_pairs:
            missing.append('info')
        if (batch, sample) not in raw_data_dict:
            missing.append('raw_data')
        if (batch, sample) not in follow_up_dict:
            missing.append('follow_up')
        elif follow_up_dict[(batch, sample)].empty:
            missing.append('follow_up_empty')
        return missing

    @staticmethod
    def _add_medium_tag(table, batch, sample, info_df, medium_df):
        """Tag *table* with its medium name and composition, when known."""
        info_row = info_df[(info_df['Batch'] == batch) & (info_df['Sample'] == sample)]
        if info_row.empty:
            return
        medium_name = info_row['Medium'].iloc[0]
        if pd.isna(medium_name):  # no medium recorded for this pair
            return
        medium_row = medium_df[medium_df['MILIEU'] == medium_name]
        if medium_row.empty:
            return
        # Composition dict, without the medium-name column itself
        medium_composition = medium_row.iloc[0].to_dict()
        medium_composition.pop('MILIEU', None)
        table.add_tag(Tag('medium', medium_name,
                          additional_info={'composed': medium_composition}))

    @staticmethod
    def _tag_columns(table):
        """Mark the time column as index and measurements as data columns."""
        for col in table.column_names:
            if col == 'Time (h)':
                table.add_column_tag_by_name(col, 'is_index_column', 'true')
            elif col not in ['Batch', 'Sample', 'Medium']:
                table.add_column_tag_by_name(col, 'is_data_column', 'true')

    def _create_metadata_table(self, info_df, medium_df, follow_up_dict):
        """Create a metadata table for ML analyses.

        One row per batch-sample pair, carrying the medium composition
        and the median of every numeric follow-up column.
        """
        metadata_rows = []
        for _, row in info_df.iterrows():
            batch = row['Batch']
            sample = row['Sample']
            medium = row['Medium']
            # Row identified by the "<batch>_<sample>" series name
            metadata_row = {
                'Series': f"{batch}_{sample}",
                'Medium': medium
            }
            # Medium composition (if the medium is known)
            medium_row = medium_df[medium_df['MILIEU'] == medium]
            if not medium_row.empty:
                for col in medium_row.columns:
                    if col != 'MILIEU':
                        metadata_row[col] = medium_row[col].iloc[0]
            # Medians of the numeric follow-up measurements (if available)
            if (batch, sample) in follow_up_dict:
                follow_up_df = follow_up_dict[(batch, sample)]
                for col in follow_up_df.select_dtypes(include=[np.number]).columns:
                    if col not in ['Batch', 'Sample', 'Time (h)']:
                        metadata_row[f"{col}_median"] = follow_up_df[col].median()
            metadata_rows.append(metadata_row)
        return Table(pd.DataFrame(metadata_rows))

    def _create_venn_diagram(self, batch_sample_pairs, raw_data_dict, follow_up_dict):
        """Create a Venn diagram showing data availability across sources."""
        sample_sets = {
            'info': batch_sample_pairs,
            'raw_data': set(raw_data_dict.keys()),
            'follow_up': set(k for k, v in follow_up_dict.items() if not v.empty)
        }
        # Your Venn diagram creation logic
        fig = go.Figure()
        # ... create Venn diagram with sample_sets
        return PlotlyResource(fig)

    def _process_medium_table(self, medium_df):
        """Return a Table with numeric medium columns converted to float.

        Works on a copy so the caller's DataFrame is not mutated (the
        original example modified *medium_df* in place).
        """
        medium_df = medium_df.copy()
        for col in medium_df.select_dtypes(include=['object']).columns:
            if col != 'MILIEU':  # skip the medium-name column
                # Decimal commas -> dots, then coerce; unparseable -> 0
                medium_df[col] = pd.to_numeric(
                    medium_df[col].str.replace(',', '.'),
                    errors='coerce'
                ).fillna(0)
        return Table(medium_df)
Critical Tags to Add
Your Load Data Task must add these tags to enable the framework:
On Each Batch-Sample Table (in ResourceSet):
- Tag('batch', batch_value): The batch identifier (e.g., 'B001', 'ESSAI01')
- Tag('sample', sample_value): The sample identifier (e.g., 'S1', 'FERMENTEUR1')
- Tag('medium', medium_name): Optional, medium name with its composition in additional_info
- Tag('missing_value', 'info, raw_data, follow_up'): Optional, comma-separated list of missing data types
Column Tags on Each Table:
- Time column: add_column_tag_by_name('Time (h)', 'is_index_column', 'true')
- Data columns: add_column_tag_by_name('OD600', 'is_data_column', 'true') (one call per measurement column)
- Skip the batch, sample and medium columns (these are metadata, not data)
Example:
# Build the table for one batch-sample pair and name it after the pair.
table = Table(merged_df)
table.name = f"{batch}_{sample}"
# Table-level tags: 'batch' and 'sample' are required by the framework.
table.add_tag(Tag('batch', batch))
table.add_tag(Tag('sample', sample))
# Optional: medium name, with its full composition in additional_info.
table.add_tag(Tag('medium', 'Medium1', additional_info={'composed': {'Glucose': 10.0, 'Nitrogen': 2.0}}))
table.add_tag(Tag('missing_value', 'raw_data'))  # If raw data is missing
# Column-level tags: one index (time) column, one tag per data column.
table.add_column_tag_by_name('Time (h)', 'is_index_column', 'true')
table.add_column_tag_by_name('OD600', 'is_data_column', 'true')
table.add_column_tag_by_name('pH', 'is_data_column', 'true')