# Source code for scythe.base
from typing import List, Iterator, Tuple, Iterable, Union, Sequence
from abc import ABC, abstractmethod
import logging
import os
from scythe.utils.grouping import preprocess_paths
logger = logging.getLogger(__name__)
class BaseExtractor(ABC):
    """Abstract base class for a metadata extractor

    This class defines the interface for all extractors in Scythe. Each new extractor must
    implement the :meth:`extract`, :meth:`version`, and :meth:`implementors` functions. The
    :meth:`group` method should be overridden to generate smart groups of files (e.g.,
    associating the inputs and outputs to the same calculation). :meth:`citations` can be
    used if there are papers that should be cited if the extractor is used as part of a
    scientific publication.

    See the `Scythe Contributor Guide <contributor-guide.html>`_ for further details.
    """

    def identify_files(self, path: str, context: dict = None) -> \
            Iterator[Tuple[str, ...]]:
        """Identify all groups of files likely to be compatible with this extractor

        Uses the :meth:`group` function to determine groups of files that should be parsed
        together. Grouping is performed one directory at a time, so a group never spans
        more than one level of the directory tree.

        Args:
            path (str): Root of directory to group together
            context (dict): Context about the files
        Yields:
            ((str)): Groups of eligible files
        """
        # Walk through the directories
        for root, dirs, files in os.walk(path):
            # Generate the full paths. New lists are built (rather than editing ``dirs``
            # in place) so the traversal performed by ``os.walk`` is not altered.
            dir_paths = [os.path.join(root, d) for d in dirs]
            file_paths = [os.path.join(root, f) for f in files]

            # Get any groups from this directory
            yield from self.group(file_paths, dir_paths, context)

    def extract_directory(self, path: str, context: dict = None) -> \
            Iterator[Tuple[Tuple[str, ...], dict]]:
        """Run extractor on all appropriate files in a directory

        Skips groups of files that throw exceptions while parsing. Each skipped group is
        reported, with its traceback, as a debug-level log message.

        Args:
            path (str): Root of directory to extract metadata from
            context (dict): Context about the files
        Yields:
            ((str), dict): Tuple of the group identity and the metadata unit
        """
        for group in self.identify_files(path, context):
            try:
                metadata_unit = self.extract(group, context)
            except Exception:
                # Groups may be false positives, so a failure here is expected and
                # non-fatal. Record it at debug level instead of discarding it silently.
                logging.getLogger(__name__).debug(
                    'Skipping group %s: extraction failed', group, exc_info=True)
                continue
            yield group, metadata_unit

    @abstractmethod
    def extract(self, group: Iterable[str], context: dict = None) -> dict:
        """Extract metadata from a group of files

        A group of files is a set of 1 or more files that describe the same object
        and will be used together to create a single metadata record.

        Arguments:
            group ([str]): A list of one or more files that should be parsed together
            context (dict): Context about the files
        Returns:
            (dict): The parsed results, in JSON-serializable format.
        """

    def group(self, files: Union[str, List[str]], directories: List[str] = None,
              context: dict = None) -> Iterator[Tuple[str, ...]]:
        """Identify groups of files and directories that should be parsed together

        Will create groups using only the files and directories included as input.

        The list of files contains _all_ files that could be read by this extractor,
        which may include many false positives.

        This default implementation ignores ``directories`` and ``context`` and places
        every file in a group of its own.

        Args:
            files (str or [str]): List of files to consider grouping
            directories ([str]): Any directories to consider grouping as well
            context (dict): Context about the files
        Yields:
            ((str)): Groups of files
        """
        # Make sure file paths are strings or Path-like objects
        files = preprocess_paths(files)

        # Default: Every file is in its own group
        for f in files:
            yield f,

    def citations(self) -> List[str]:
        """Citation(s) and reference(s) for this extractor

        Returns:
            ([str]): each element should be a string citation in BibTeX format
        """
        return []

    @abstractmethod
    def implementors(self) -> List[str]:
        """List of implementors of the extractor

        These people are the points-of-contact for addressing errors or modifying the extractor

        Returns:
            ([str]): List of implementors in the form "FirstName LastName <email@provider>"
        """

    @abstractmethod
    def version(self) -> str:
        """Return the version of the extractor

        Returns:
            (str): Version of the extractor
        """

    @property
    def schema(self) -> dict:
        """Schema for the output of the extractor"""
        return {
            "$schema": "http://json-schema.org/schema#"
        }
class BaseSingleFileExtractor(BaseExtractor):
    """Base class for extractors that only ever consider a single file at a time

    Instead of implementing :meth:`extract`, implement :meth:`_extract_file`
    """

    @abstractmethod
    def _extract_file(self, path: str, context=None):
        """Generate the metadata for a single file

        Args:
            path (str): Path to the file
            context (dict): Optional context information about the file
        Returns:
            (dict): Metadata for the file
        """

    def extract(self, group: Union[str, Sequence[str]], context=None):
        """Extract metadata from a single file

        Args:
            group (str or [str]): Path to the file, either on its own or as the only
                member of a sequence
            context (dict): Optional context information about the file
        Returns:
            (dict): Metadata for the file, as produced by :meth:`_extract_file`
        Raises:
            ValueError: If ``group`` is a sequence that does not hold exactly one file
        """
        # Error catching: allows for single files to be passed not as a list
        if isinstance(group, str):
            return self._extract_file(group, context)

        # The group must have exactly one file. An empty group is rejected explicitly
        # so that it does not surface as an IndexError from the lookup below.
        if len(group) > 1:
            raise ValueError('Extractor only takes a single file at a time')
        if len(group) == 0:
            raise ValueError('Extractor requires a file, but the group is empty')

        return self._extract_file(group[0], context)