Source code for scythe.base

from typing import List, Iterator, Tuple, Iterable, Union, Sequence
from abc import ABC, abstractmethod
import logging
import os

from scythe.utils.grouping import preprocess_paths

logger = logging.getLogger(__name__)


[docs]class BaseExtractor(ABC):
    """Abstract base class for a metadata extractor

    This class defines the interface for all extractors in Scythe. Each new extractor must
    implement the :meth:`parse`, :meth:`version`, and :meth:`implementors` functions. The
    :meth:`group` method should be overridden to generate smart groups of file (e.g., associating
    the inputs and outputs to the same calculation) :meth:`citations` can be used if there
    are papers that should be cited if the extractor is used as part of a scientific publication.

    See the `Scythe Contributor Guide <contributor-guide.html>`_ for further details.
    """

[docs]    def identify_files(self, path: str, context: dict = None) -> \
            Iterator[Tuple[str]]:
        """Identify all groups of files likely to be compatible with this extractor

        Uses the :meth:`group` function to determine groups of files that should be parsed together.

        Args:
            path (str): Root of directory to group together
            context (dict): Context about the files
        Yields:
            ([str]) Groups of eligible files
        """

        # Walk through the directories
        for root, dirs, files in os.walk(path):
            # Generate the full paths
            dirs = [os.path.join(root, d) for d in dirs]
            files = [os.path.join(root, f) for f in files]

            # Get any groups from this directory
            for group in self.group(files, dirs, context):
                yield group

[docs]    def extract_directory(self, path: str, context: dict = None) -> \
            Iterator[Tuple[Tuple[str], dict]]:
        """Run extractor on all appropriate files in a directory

        Skips files that throw exceptions while parsing

        Args:
            path (str): Root of directory to extract metadata from
            context (dict): Context about the files
        Yields:
            ([str], dict): Tuple of the group identity and the metadata unit
        """

        for group in self.identify_files(path, context):
            try:
                metadata_unit = self.extract(group, context)
            except Exception:
                continue
            else:
                yield group, metadata_unit

[docs]    @abstractmethod
    def extract(self, group: Iterable[str], context: dict = None) -> dict:
        """Extract metadata from a group of files

        A group of files is a set of 1 or more files that describe the same object
        and will be used together to create a single metadata record.

        Arguments:
            group ([str]):  A list of one or more files that should be parsed together
            context (dict): Context about the files

        Returns:
            (dict): The parsed results, in JSON-serializable format.
        """

[docs]    def group(self, files: Union[str, List[str]], directories: List[str] = None,
              context: dict = None) -> Iterator[Tuple[str, ...]]:
        """Identify a groups of files and directories that should be parsed together

        Will create groups using only the files and directories included as input.

        The files of files are _all_ files that could be read by this extractor,
        which may include many false positives.

        Args:
            files (str or [str]): List of files to consider grouping
            directories ([str]): Any directories to consider group as well
            context (dict): Context about the files
        Yields:
            ((str)): Groups of files
        """

        # Make sure file paths are strings or Path-like objects
        files = preprocess_paths(files)

        # Default: Every file is in its own group
        for f in files:
            yield f,

[docs]    def citations(self) -> List[str]:
        """Citation(s) and reference(s) for this extractor

        Returns:
            ([str]): each element should be a string citation in BibTeX format
        """
        return []

[docs]    @abstractmethod
    def implementors(self) -> List[str]:
        """List of implementors of the extractor

        These people are the points-of-contact for addressing errors or modifying the extractor

        Returns:
            ([str]): List of implementors in the form "FirstName LastName <email@provider>"
        """

[docs]    @abstractmethod
    def version(self) -> str:
        """Return the version of the extractor

        Returns:
            (str): Version of the extractor
        """

    @property
    def schema(self) -> dict:
        """Schema for the output of the extractor"""
        return {
            "$schema": "http://json-schema.org/schema#"
        }


class BaseSingleFileExtractor(BaseExtractor):
    """Base class for extractors that only ever considers a single file at a time

    Instead of implementing :meth:`parse`, implement :meth:`_parse_file`"""

    @abstractmethod
    def _extract_file(self, path: str, context=None):
        """Generate the metadata for a single file

        Args:
            path (str): Path to the file
            context (dict): Optional context information about the file
        Returns:
            (dict): Metadata for the file
        """

    def extract(self, group: Union[str, Sequence[str]], context=None):
        # Error catching: allows for single files to passed not as list
        if isinstance(group, str):
            return self._extract_file(group, context)

        # Assumes that the group must have exactly one file
        if len(group) > 1:
            raise ValueError('Extractor only takes a single file at a time')

        return self._extract_file(group[0], context)