Spaces:
Sleeping
Sleeping
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. | |
| import importlib | |
| from pathlib import Path | |
| import torch | |
| import logging | |
| logger = logging.getLogger(__name__) | |
def load_module_from_py_file(py_file: str) -> object:
    """Load and execute a Python source file that is not on the import path.

    The module is built from a ``SourceFileLoader`` and executed, but it is
    not inserted into ``sys.modules``; each call therefore re-executes the
    file and returns a new module object.

    Args:
        py_file: Path to the Python source file to load.

    Returns:
        The executed module object.

    Raises:
        Any exception raised while reading or executing the file propagates
        to the caller unchanged.
    """
    # A bare ``import importlib`` does not guarantee that the ``machinery``
    # and ``util`` submodules are bound as attributes of the package; they
    # are only available if something else already imported them. Import
    # them explicitly so this function never depends on import order.
    import importlib.machinery
    import importlib.util

    # The module is named after the file, extension included
    # (e.g. "my_dataset.py"); it is never registered globally, so the
    # unusual dotted name cannot clash with a real package.
    module_name = Path(py_file).name
    loader = importlib.machinery.SourceFileLoader(module_name, py_file)
    spec = importlib.util.spec_from_loader(module_name, loader)
    module = importlib.util.module_from_spec(spec)
    loader.exec_module(module)
    return module
def get_custom_dataset(dataset_config, tokenizer, split: str):
    """Build a dataset by calling a factory function from a user .py file.

    ``dataset_config.file`` names the file and, optionally, the factory:

    * ``"path/to/file.py"`` - the factory defaults to ``get_custom_dataset``.
    * ``"path/to/file.py:func_name"`` - ``func_name`` is looked up instead.

    Args:
        dataset_config: Configuration object; its ``file`` attribute is read
            here and the whole object is forwarded to the factory.
        tokenizer: Forwarded unchanged to the factory.
        split: Forwarded unchanged to the factory.

    Returns:
        Whatever the factory returns for
        ``(dataset_config, tokenizer, split)``.

    Raises:
        ValueError: If the file part does not end in ``.py``.
        FileNotFoundError: If the file does not exist or is not a file.
        AttributeError: If the factory name is not defined in the file.
    """
    file_spec = dataset_config.file

    # Split on the LAST colon only, and only when the spec does not already
    # end in ".py": a function name can never end in ".py", so such a spec
    # carries no function suffix. This keeps paths that themselves contain a
    # colon (e.g. a Windows drive letter "C:\\data\\ds.py") intact.
    head, sep, tail = file_spec.rpartition(":")
    if sep and not file_spec.endswith(".py"):
        module_path, func_name = head, tail
    else:
        module_path, func_name = file_spec, "get_custom_dataset"

    if not module_path.endswith(".py"):
        raise ValueError(f"Dataset file {module_path} is not a .py file.")

    module_path = Path(module_path)
    if not module_path.is_file():
        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")

    module = load_module_from_py_file(module_path.as_posix())

    # Guard only the attribute lookup. An AttributeError raised *inside* the
    # factory is a bug in the user's dataset code and must not be reported
    # as a missing function name.
    try:
        dataset_factory = getattr(module, func_name)
    except AttributeError:
        logger.error(
            "It seems like the given method name (%s) is not present in the dataset .py file (%s).",
            func_name,
            module_path.as_posix(),
        )
        raise

    return dataset_factory(dataset_config, tokenizer, split)
def get_preprocessed_dataset(
    tokenizer, dataset_config, split: str = "train"
) -> torch.utils.data.Dataset:
    """Return the custom dataset for the configured train or test split.

    Args:
        tokenizer: Forwarded unchanged to ``get_custom_dataset``.
        dataset_config: Configuration object; ``train_split`` is read when
            ``split`` is ``"train"`` and ``test_split`` otherwise. The object
            is also forwarded to ``get_custom_dataset``.
        split: ``"train"`` selects ``dataset_config.train_split``; any other
            value selects ``dataset_config.test_split``.

    Returns:
        The result of ``get_custom_dataset`` for the selected split name.
    """
    # Translate the logical split into the split name stored on the config.
    if split == "train":
        split_name = dataset_config.train_split
    else:
        split_name = dataset_config.test_split

    return get_custom_dataset(dataset_config, tokenizer, split_name)