# Source code for scythe.utils.grouping

"""Utilities for implementing grouping operations"""
from typing import Union, List, Iterable, Tuple
from operator import itemgetter
from pathlib import Path
import itertools
import os


def preprocess_paths(
    paths: Union[str, os.PathLike, Iterable[Union[str, os.PathLike]]]
) -> List[str]:
    """Transform paths to absolute paths

    Designed to be used to simplify grouping logic

    Args:
        paths (Union[str, os.PathLike, Iterable[Union[str, os.PathLike]]]):
            A single file or directory, or an iterable of files and
            directories, to be parsed
    Returns:
        (List[str]): List of paths in standardized form: the user home
            directory ("~") is expanded and each path is made absolute
    """

    # A lone path is wrapped in a list so the rest of the function only deals
    # with iterables. Checking ``os.PathLike`` (rather than only ``Path``) also
    # covers ``PurePath`` and any other object implementing ``__fspath__``,
    # which are not iterable and would otherwise raise a TypeError below.
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    # Expand "~" first, then make the paths absolute
    return [os.path.abspath(os.path.expanduser(f)) for f in paths]
def group_by_postfix(files: Iterable[str], vocabulary: List[str]) -> Iterable[List[str]]:
    """Group files that have a common ending

    Finds all filenames that begin with a prefix from a user-provided
    vocabulary and groups together those that live in the same directory and
    end with the same post-fix. Prefix matching is case-insensitive; the
    post-fix comparison is case-sensitive.

    For example, consider a directory that contains files A.1, B.1, A.2, B.2,
    and C.1. If a user provides a vocabulary of ['A', 'B'], the parser will
    return groups [A.1, B.1] and [A.2, B.2]; C.1 is skipped because it does not
    start with a known prefix. If a user provides a vocabulary of
    ['A', 'B', 'C'], the parser will return groups [A.1, B.1, C.1] and
    [A.2, B.2].

    See :class:`scythe.dft.DFTParser` for an example usage.

    Args:
        files ([str]): List of files to be grouped
        vocabulary ([str]): List of known starts for the file. If several
            entries match a filename, the first one in the list is used.
    Yields:
        ([str]): Groups of files to be parsed together. Groups are produced in
            sorted (directory, post-fix) order; files within a group keep
            their input order.
    """

    # TODO (lw): This function could be more flexible, but let's add features on demand

    # Lower-case the vocabulary once so that the match is case-insensitive on
    # both sides (the filename is lower-cased below). The original length of
    # each entry is kept to slice the post-fix off the unmodified filename.
    prefixes = [(entry.lower(), len(entry)) for entry in vocabulary]

    # Get the files with similar post-fixes and are from the user-defined vocabulary
    matchable_files = []  # List of (path, (dir, postfix))
    for filename in files:
        # Find if the filename matches a known type
        name = os.path.basename(filename)
        name_lower = name.lower()
        prefix_length = next(
            (length for prefix, length in prefixes if name_lower.startswith(prefix)),
            None,
        )
        if prefix_length is None:
            continue

        # Everything after the matched prefix is the post-fix
        postfix = name[prefix_length:]
        directory = os.path.dirname(filename)

        # Add to the list
        matchable_files.append((filename, (directory, postfix)))

    # Group files by postfix type and directory. ``groupby`` only merges
    # adjacent items, so the list is sorted by the same key first; the sort is
    # stable, which preserves the input order inside each group.
    sort_key = itemgetter(1)
    for _, group in itertools.groupby(sorted(matchable_files, key=sort_key), key=sort_key):
        yield [path for path, _ in group]