DataSet Manager API Documentation
DataSet
Container for experiment metadata and associated data file locations.
A DataSet instance describes a single experiment or dataset in terms of:
- Basic metadata: name, creation date, experiment date/time, device identifier.
- User annotations: free-form notes and a simple time-stamped console log.
- File layout: a mapping from human-readable labels to absolute file paths.
- Optional colour mapping: label-to-colour mapping for consistent plotting.
- Structure type: how files are organised on disk (e.g. flat vs. directory-labelled).
The class does not interpret the contents of the files; it only tracks their
locations and minimal metadata. Filepaths can be:
- Added manually via add_filepath, or
- Auto-populated from a root directory using construct_filepaths(...).
Structure types
"flat": A simple one-level mapping from label -> file path. This is the default and most common mode;construct_filepaths_nrecursivewill scan a single directory and register all supported files using their stem as label."dirlabelled": A directory-labelled mode where each top-level directory becomes a label and contains its own mapping of files. This is handled byconstruct_structured_filepathsand is considered deprecated in favour of the genericconstruct_filepaths."structured": Reserved for legacy/experimental layouts; treated as an allowed but user-managed structure type.
Validation
All paths added to the dataset are checked for:
- Existence on disk.
- Being a file (not a directory).
- Having an extension in the accepted set (xlsx, xls, csv,
txt, dpt, json).
Two DataSet instances are considered equal if all their attributes
(including filepaths, colours, and metadata) match exactly.
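A minimal usage sketch, assuming the module layout shown in the source paths below; the label and path are hypothetical, and the file must exist on disk for validation to pass:

from dataset_manager.dataset import DataSet

# Hypothetical example: create a dataset and register one file manually.
ds = DataSet("2024.01.15_10.30.00")  # creation date in YYYY.MM.DD_HH.MM.SS format
ds.set_name("calibration run")
ds.set_structure_type("flat")

# add_filepath validates existence, file-ness, and extension; it returns
# an empty string on success and a short message otherwise.
result = ds.add_filepath(path="/abs/path/to/run1.csv", label="run1")
if result:
    print(result)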
Source code in dataset_manager/dataset.py
@decorate_class_with_logging(log_level=DEBUG)
class DataSet:
"""
Container for experiment metadata and associated data file locations.
A `DataSet` instance describes a single experiment or dataset in terms of:
- Basic metadata: name, creation date, experiment date/time, device identifier.
- User annotations: free-form notes and a simple time-stamped console log.
- File layout: a mapping from human-readable labels to absolute file paths.
- Optional colour mapping: label-to-colour mapping for consistent plotting.
- Structure type: how files are organised on disk (e.g. flat vs. directory-labelled).
The class does *not* interpret the contents of the files; it only tracks their
locations and minimal metadata. Filepaths can be:
- Added manually via `add_filepath`, or
- Auto-populated from a root directory using `construct_filepaths(...)`.
Structure types
----------------
- ``"flat"``:
A simple one-level mapping from label -> file path. This is the default
and most common mode; `construct_filepaths_nrecursive` will scan a single
directory and register all supported files using their stem as label.
- ``"dirlabelled"``:
A directory-labelled mode where each top-level directory becomes a label
and contains its own mapping of files. This is handled by
`construct_structured_filepaths` and is considered deprecated in favour
of the generic `construct_filepaths`.
- ``"structured"``:
Reserved for legacy/experimental layouts; treated as an allowed but
user-managed structure type.
Validation
----------
All paths added to the dataset are checked for:
- Existence on disk.
- Being a file (not a directory).
- Having an extension in the accepted set (``xlsx``, ``xls``, ``csv``,
``txt``, ``dpt``, ``json``).
Two `DataSet` instances are considered equal if all their attributes
(including filepaths, colours, and metadata) match exactly.
"""
# TODO: properly deprecate structured, assume flat for now
_allowed_structure_types = ("flat", "dirlabelled", "structured")
_accepted_extensions = ("xlsx", "xls", "csv", "txt", "dpt", "json")
def __init__(self, creation_date: str):
if not isinstance(creation_date, str):
raise ValueError("creation_date must be a string in format YYYY.MM.DD_HH.MM.SS")
self.location = None
self.name = ""
self.creation_date = datetime.strptime(creation_date, "%Y.%m.%d_%H.%M.%S")
self.experiment_date_time = None
# FEATURE REQUEST: Allow multiple device types to be compatible with the same set
self.device = ""
self.notes = ""
self.console = {}
self.structure_type = None
self.filepaths = {}
self.colours = {}
# Setters
def set_name(self, name: str):
if not isinstance(name, str):
raise ValueError("name must be a string")
self.name = name
def set_experiment_date(self, experiment_date_time: str):
if not isinstance(experiment_date_time, str):
raise ValueError("experiment_date_time must be a string in format YYYY.MM.DD_HH.MM.SS")
self.experiment_date_time = datetime.strptime(experiment_date_time, "%Y.%m.%d_%H.%M.%S")
def set_device(self, device: str):
if not isinstance(device, str):
raise ValueError("device must be a string")
self.device = device
def set_structure_type(self, desired_type: str):
        if desired_type not in self._allowed_structure_types:
            raise ValueError(f"structure type must be one of {self._allowed_structure_types}")
if self.structure_type is None:
self.structure_type = desired_type
# Warn users when trying to overwrite the structure type
def set_notes(self, notes_content: str):
if not isinstance(notes_content, str):
raise ValueError("notes_content must be a string")
self.notes = notes_content
def set_console(self, console_content: dict):
if not isinstance(console_content, dict):
raise ValueError("console_content must be a dict")
self.console = console_content
def set_filepaths(self, filepaths: dict):
if not isinstance(filepaths, dict):
raise ValueError("filepaths must be a dict")
self.filepaths = filepaths
def set_colours(self, colours: dict):
if not isinstance(colours, dict):
raise ValueError("colours must be a dict")
self.colours = colours
def set_location(self, location: str):
if not isinstance(location, str):
raise ValueError("location must be a string/path")
self.location = location
def construct_filepaths(self, root_dir: str, type: str) -> str:
warnings.warn("New function construct_filepaths_nrecursive not implemented recursively")
# TODO: Should depend on experiment type (making structure redundant)?
# TODO: Something about the experiment type compatibility here.
if type in self._allowed_structure_types:
self.set_structure_type(type)
else:
return f"Incompatible structure type ({type}). Choose from {self._allowed_structure_types}"
        match type:
            case "flat":
                return self.construct_filepaths_nrecursive(root_dir)
            case "dirlabelled":
                return self.construct_structured_filepaths(root_dir)
            case _:
                # "structured" layouts are user-managed; nothing to construct automatically
                return ""
def construct_filepaths_nrecursive(self, root_dir) -> str:
"""
        Generate a flat file set and add it to the current filepaths. This scans all
        files at the top level of the given root_dir and appends all supported dataset
        files to the filepaths attribute. Note that root_dir should be an absolute path.
"""
errors = ""
# Checks which files are contained in the root dir
items = natsort.natsorted(os.listdir(root_dir))
for item in items:
# Ignores duplicates
if item in self.filepaths.keys():
errors += f"Ignored {item}: duplicate label \n"
continue
# Only add valid files
path = f"{root_dir}/{item}"
is_path_valid, error_msg = self._check_valid_path(path)
if is_path_valid:
# Use filename as path label
self.add_filepath(path=path, label=Path(path).stem)
else:
errors += error_msg
return errors
def construct_filepaths_recursive(self, root_dir) -> str:
raise NotImplementedError
def construct_structured_filepaths(self, root_dir: str) -> str:
"""
        Generate a dirlabelled file set and add it to the current filepaths. This scans
        all files and subdirectories of the given root_dir and appends all supported
        dataset files to the filepaths attribute. Note that root_dir should be an absolute path.
"""
warnings.warn("Function construct_structured_filepaths is deprecated use construct_filepaths instead", DeprecationWarning)
if self.get_structure_type() != "flat":
errors = ""
items = natsort.natsorted(os.listdir(root_dir))
for item in items:
if item in self.filepaths.keys():
errors += f"Ignored {item}: duplicate label \n"
continue
# Create nested dict for subdirectories
path = f"{root_dir}/{item}"
if not os.path.isfile(path):
self.filepaths[item] = {}
for file in natsort.natsorted(os.listdir(path)):
# Only append to dataset if file is actually a file with an accepted extension
filepath = f"{path}/{file}"
is_path_valid, error_msg = self._check_valid_path(filepath)
if is_path_valid:
self.filepaths[item][file] = filepath
else:
errors += error_msg
else:
errors = "Flat dataset_manager cannot use dirlabelled construction"
return errors
# Getters
def get_filepath(self, label: str) -> str:
return self.filepaths[label]
def get_filepaths(self) -> dict:
return self.filepaths
def get_experiment_date(self):
return self.experiment_date_time
def get_single_colour(self, label: str) -> str:
if label in self.colours.keys():
return self.colours[label]
return None
def get_all_colours(self) -> dict:
if len(self.colours) == 0:
return None
return self.colours
def get_labels(self):
return self.filepaths.keys()
def get_console(self) -> dict:
return self.console
def get_notes(self) -> str:
return self.notes
def get_structure_type(self) -> str:
return self.structure_type
def get_device(self) -> str:
return self.device
def get_creation_date(self) -> datetime:
return self.creation_date
def get_name(self) -> str:
return self.name
def get_location(self) -> str | None:
if self.location:
return self.location
return None
# Adding / Appending
def add_notes(self, additional_notes: str):
if not isinstance(additional_notes, str):
raise ValueError("additional_notes must be a string")
self.notes += additional_notes
def add_console(self, date_and_time: str, additional_console: str):
if not isinstance(date_and_time, str):
raise ValueError("date_and_time must be a string")
if not isinstance(additional_console, str):
raise ValueError("additional_console must be a string")
self.console[date_and_time] = additional_console
# Path management
def add_filepath(self, path: str, label: str):
# Wrap flat paths for validation
if self.get_structure_type() == 'flat':
path_to_validate = {label: path}
else:
path_to_validate = path
path_to_store = path
# Check for duplicate label
if label in self.filepaths.keys():
return "Duplicate label found in dataset_manager"
# Check that all paths are valid
for sublabel in path_to_validate:
# Check path before adding:
is_path_valid, error_msg = self._check_valid_path(path=path_to_validate[sublabel])
if not is_path_valid:
print(error_msg)
return "Will not add file with disallowed extension"
# Add the path
self.filepaths[label] = path_to_store
return ""
def add_colour(self, colour: str, label: str):
# Checks for duplicate label
if label in self.colours.keys():
return "Duplicate label found in colours"
else:
# Add the file to the dataset and update the gui
self.colours[label] = colour
    # Checks are needed before paths are added to the dataset
def _check_valid_path(self, path: str):
if not isinstance(path, str):
raise ValueError("path must be a string")
# Checks whether the path exists and points to a file
if os.path.exists(path) and os.path.isfile(path):
# Checks if the file has the proper extension
            # Require a dot before the extension so e.g. "data.mycsv" is rejected
            if path.endswith(tuple(f".{ext}" for ext in self._accepted_extensions)):
return True, ""
else:
return False, f"DataSet Forbidden Extension: Ignored {path}\n"
elif os.path.exists(path) and not os.path.isfile(path):
return False, f"DataSet Not a File: Ignored {path}\n"
else:
return False, f"DataSet Filesystem Error: Ignored {path}\n"
def __eq__(self, other):
if type(other) is type(self):
return self.__dict__ == other.__dict__
return False
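A usage sketch for the auto-population entry point; the directory path is hypothetical and should contain supported files:

# Hypothetical example: auto-populate a flat dataset from a directory.
ds = DataSet("2024.01.15_10.30.00")
errors = ds.construct_filepaths("/abs/path/to/experiment", "flat")
if errors:
    print(errors)             # accumulated skip messages, one per line
print(list(ds.get_labels()))  # file stems registered as labels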
construct_filepaths_nrecursive(root_dir)
Generates a flat file set and adds it to the current filepaths. This scans all files at the top level of the given root_dir and appends all supported dataset files to the filepaths attribute. Note that root_dir should be an absolute path.
Source code in dataset_manager/dataset.py
def construct_filepaths_nrecursive(self, root_dir) -> str:
"""
    Generate a flat file set and add it to the current filepaths. This scans all
    files at the top level of the given root_dir and appends all supported dataset
    files to the filepaths attribute. Note that root_dir should be an absolute path.
"""
errors = ""
# Checks which files are contained in the root dir
items = natsort.natsorted(os.listdir(root_dir))
for item in items:
# Ignores duplicates
if item in self.filepaths.keys():
errors += f"Ignored {item}: duplicate label \n"
continue
# Only add valid files
path = f"{root_dir}/{item}"
is_path_valid, error_msg = self._check_valid_path(path)
if is_path_valid:
# Use filename as path label
self.add_filepath(path=path, label=Path(path).stem)
else:
errors += error_msg
return errors
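The method never raises for individual files; problems are accumulated in the returned string, so callers should inspect it. A short hypothetical sketch:

# Hypothetical example: report every entry skipped during a scan.
errors = ds.construct_filepaths_nrecursive("/abs/path/to/experiment")
for line in errors.splitlines():
    print("skipped:", line)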
construct_structured_filepaths(root_dir)
Generates a dirlabelled file set and adds it to the current filepaths. This scans all files and subdirectories of the given root_dir and appends all supported dataset files to the filepaths attribute. Note that root_dir should be an absolute path.
Source code in dataset_manager/dataset.py
def construct_structured_filepaths(self, root_dir: str) -> str:
"""
    Generate a dirlabelled file set and add it to the current filepaths. This scans
    all files and subdirectories of the given root_dir and appends all supported
    dataset files to the filepaths attribute. Note that root_dir should be an absolute path.
"""
warnings.warn("Function construct_structured_filepaths is deprecated use construct_filepaths instead", DeprecationWarning)
if self.get_structure_type() != "flat":
errors = ""
items = natsort.natsorted(os.listdir(root_dir))
for item in items:
if item in self.filepaths.keys():
errors += f"Ignored {item}: duplicate label \n"
continue
# Create nested dict for subdirectories
path = f"{root_dir}/{item}"
if not os.path.isfile(path):
self.filepaths[item] = {}
for file in natsort.natsorted(os.listdir(path)):
# Only append to dataset if file is actually a file with an accepted extension
filepath = f"{path}/{file}"
is_path_valid, error_msg = self._check_valid_path(filepath)
if is_path_valid:
self.filepaths[item][file] = filepath
else:
errors += error_msg
else:
errors = "Flat dataset_manager cannot use dirlabelled construction"
return errors
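Since this method is deprecated, new code should go through the generic entry point, which delegates here for dirlabelled layouts (and still emits the deprecation warning). A hypothetical sketch:

# Hypothetical example: build a directory-labelled dataset via the
# generic entry point rather than calling this method directly.
ds = DataSet("2024.01.15_10.30.00")
errors = ds.construct_filepaths("/abs/path/to/experiment", "dirlabelled")
# Resulting shape: {"sample_A": {"scan1.csv": "/abs/path/to/experiment/sample_A/scan1.csv"}, ...}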
DataSetJSONEncoder
Bases: JSONEncoder
JSON encoder for DataSet objects and related dataclasses.
This encoder provides two custom behaviours:
- datetime.datetime instances are serialised to a compact string representation using the format "%Y.%m.%d_%H.%M.%S". This matches the format expected by DataSet and the corresponding JSON decoder.
- All other objects are serialised via their __dict__ attribute, which is sufficient for simple container-like classes such as DataSet.
The encoder is intended to be used together with DataSetJSONDecoder to
provide a round-trip-safe JSON representation of datasets.
Source code in dataset_manager/dataset_json_encoder.py
@decorate_class_with_logging(log_level=DEBUG)
class DataSetJSONEncoder(JSONEncoder):
"""
JSON encoder for `DataSet` objects and related dataclasses.
This encoder provides two custom behaviours:
- ``datetime.datetime`` instances are serialised to a compact string
representation using the format ``"%Y.%m.%d_%H.%M.%S"``. This matches
the format expected by `DataSet` and the corresponding JSON decoder.
- All other objects are serialised via their ``__dict__`` attribute,
which is sufficient for simple container-like classes such as `DataSet`.
    The encoder is intended to be used together with `DataSetJSONDecoder` to
    provide a round-trip-safe JSON representation of datasets.
"""
def default(self, o):
if isinstance(o, datetime.datetime):
return o.strftime("%Y.%m.%d_%H.%M.%S")
else:
return o.__dict__
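A serialisation sketch, assuming the module layout shown in the source paths; the dataset values are hypothetical:

import json

from dataset_manager.dataset import DataSet
from dataset_manager.dataset_json_encoder import DataSetJSONEncoder

# Hypothetical example: serialise a fully-populated DataSet to JSON.
ds = DataSet("2024.01.15_10.30.00")
ds.set_name("calibration run")
ds.set_experiment_date("2024.01.15_09.00.00")
ds.set_structure_type("flat")

payload = json.dumps(ds, cls=DataSetJSONEncoder)
# datetime fields appear as "YYYY.MM.DD_HH.MM.SS" strings in the output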
DataSetJSONDecoder
Bases: JSONDecoder
Custom JSON decoder that reconstructs DataSet instances from JSON.
This decoder installs an object_hook that:
- Detects dictionaries carrying the expected DataSet fields
(e.g. creation_date, name, device, experiment_date_time,
notes, console, structure_type, filepaths, colours).
- Instantiates a new DataSet using the stored creation date.
- Replays all relevant setters to restore metadata, structure type, paths,
colours, and annotations.
If a JSON object does not match the expected shape, it is returned unchanged,
allowing non-DataSet data to be decoded normally.
Source code in dataset_manager/dataset_json_decoder.py
@decorate_class_with_logging(log_level=DEBUG)
class DataSetJSONDecoder(JSONDecoder):
"""
Custom JSON decoder that reconstructs `DataSet` instances from JSON.
This decoder installs an `object_hook` that:
- Detects dictionaries carrying the expected `DataSet` fields
(e.g. ``creation_date``, ``name``, ``device``, ``experiment_date_time``,
``notes``, ``console``, ``structure_type``, ``filepaths``, ``colours``).
- Instantiates a new `DataSet` using the stored creation date.
- Replays all relevant setters to restore metadata, structure type, paths,
colours, and annotations.
If a JSON object does not match the expected shape, it is returned unchanged,
allowing non-`DataSet` data to be decoded normally.
"""
def __init__(self, **kwargs):
kwargs.setdefault("object_hook", self.object_hook)
super().__init__(**kwargs)
@staticmethod
def object_hook(dct):
try:
dataset = DataSet(dct['creation_date'])
dataset.set_name(dct['name'])
dataset.set_device(dct['device'])
dataset.set_experiment_date(dct['experiment_date_time'])
dataset.set_notes(dct['notes'])
dataset.set_console(dct['console'])
dataset.set_structure_type(dct['structure_type'])
dataset.set_filepaths(dct['filepaths'])
dataset.set_colours(dct['colours'])
return dataset
except KeyError:
return dct
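A decoding sketch continuing the encoder example above. Because the object_hook replays the setters, fields such as experiment_date_time and structure_type must have been set before encoding for the round trip to succeed:

from dataset_manager.dataset_json_decoder import DataSetJSONDecoder

# Hypothetical example: rebuild the DataSet from the JSON payload above.
restored = json.loads(payload, cls=DataSetJSONDecoder)
assert isinstance(restored, DataSet)
assert restored.get_name() == "calibration run"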
DataSetManager
Small helper class for persisting DataSet instances to and from JSON files.
Responsibilities
- save_dataset(dataset, file_name): Serialises a DataSet instance to disk using DataSetJSONEncoder. The method checks that the passed object is a DataSet and writes the encoded JSON to the given file path.
- open_dataset(file_name): Opens a JSON file and deserialises it into a DataSet instance using DataSetJSONDecoder.
The manager does not interpret the dataset content; it only handles the
IO and wiring between JSON encoder/decoder and the underlying DataSet
objects.
Source code in dataset_manager/dataset_manager.py
@decorate_class_with_logging(log_level=DEBUG)
class DataSetManager:
"""
Small helper class for persisting `DataSet` instances to and from JSON files.
Responsibilities
----------------
- `save_dataset(dataset, file_name)`:
Serialises a `DataSet` instance to disk using `DataSetJSONEncoder`.
The method checks that the passed object is a `DataSet` and writes
the encoded JSON to the given file path.
- `open_dataset(file_name)`:
Opens a JSON file and deserialises it into a `DataSet` instance using
`DataSetJSONDecoder`.
The manager does not interpret the dataset content; it only handles the
IO and wiring between JSON encoder/decoder and the underlying `DataSet`
objects.
"""
def __init__(self):
pass
@staticmethod
def save_dataset(dataset, file_name):
""" Saves the dataset_manager data into a JSON file """
if not isinstance(dataset, DataSet):
raise ValueError("dataset must be an instance of DataSet")
# Who should check whether the filename is valid?
        with open(file_name, "w") as json_file:
            json.dump(dataset, json_file, cls=DataSetJSONEncoder)
@staticmethod
def open_dataset(file_name):
with open(file_name) as json_file:
return json.load(json_file, cls=DataSetJSONDecoder)
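A persistence sketch reusing the dataset from the encoder example above; the file name is hypothetical:

from dataset_manager.dataset_manager import DataSetManager

# Hypothetical example: write the dataset to disk and read it back.
DataSetManager.save_dataset(ds, "experiment.json")
restored = DataSetManager.open_dataset("experiment.json")
assert restored == ds  # equality compares all attributes exactly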
save_dataset(dataset, file_name)
staticmethod
Saves the dataset into a JSON file
Source code in dataset_manager/dataset_manager.py
@staticmethod
def save_dataset(dataset, file_name):
""" Saves the dataset_manager data into a JSON file """
if not isinstance(dataset, DataSet):
raise ValueError("dataset must be an instance of DataSet")
# Who should check whether the filename is valid?
    with open(file_name, "w") as json_file:
        json.dump(dataset, json_file, cls=DataSetJSONEncoder)