# Source code for scythe.utils.interface
"""Utilities for working with extractors from other applications"""
from stevedore.extension import ExtensionManager
from stevedore.driver import DriverManager
from typing import Iterator, Union, Dict, List
from collections import namedtuple
from copy import deepcopy
from scythe.adapters.base import BaseAdapter
from scythe.base import BaseExtractor
import logging
# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Record yielded by the ``run_all_extractors_on_*`` functions:
# (group of files, name of the extractor, metadata produced for that group)
ExtractResult = namedtuple('ExtractResult', ('group', 'extractor', 'metadata'))
def _output_plugin_info(mgr: ExtensionManager) -> dict:
    """Gets information about all plugins attached to a particular manager

    Every plugin is instantiated with no arguments so that its ``version()``
    method can be called.

    Args:
        mgr (ExtensionManager): Plugin manager
    Returns:
        (dict): Dictionary where keys are plugin ids and values are dictionaries with
            the keys ``description`` (first line of the plugin's docstring, or an empty
            string if it has none), ``version`` and ``class`` (entry point target)
    """
    output = {}
    for name, ext in mgr.items():
        plugin = ext.plugin()
        # ``__doc__`` is None for a class without a docstring; use an empty
        # description rather than failing with an AttributeError
        docstring = plugin.__doc__ or ''
        output[name] = {
            'description': docstring.split("\n")[0],
            'version': plugin.version(),
            'class': ext.entry_point_target
        }
    return output
def get_available_extractors():
    """Describe every extractor registered as a plugin

    Returns:
        (dict): Where keys are extractor names and values are descriptions
    """
    # Extractors are looked up in the ``scythe.extractor`` plugin namespace
    return _output_plugin_info(ExtensionManager(namespace='scythe.extractor'))
def get_available_adapters() -> dict:
    """Describe every adapter registered as a plugin

    Returns:
        (dict): Where keys are adapter names and values are descriptions
    """
    # Adapters are looked up in the ``scythe.adapter`` plugin namespace
    adapter_mgr = ExtensionManager(
        namespace='scythe.adapter',
    )
    return _output_plugin_info(adapter_mgr)
def _get_adapter_map(adapter_map: Union[None, str, Dict[str, str]], extractors: list) -> dict:
    """Helper function to generate 'adapter map'

    The adapter map pairs extractor names with the names of the adapters
    that should be used to format their output.

    Args:
        adapter_map (None, str, dict): Specification of the adapter map.
            - None means no adapters: an empty map is returned.
            - 'match' means just find adapters with same names as corresponding extractors.
            - A dict is taken to already be an adapter map and is returned unchanged.
        extractors ([str]): list of extractors
    Returns:
        (dict) where keys are extractor names and values are adapter names
    Raises:
        ValueError: If ``adapter_map`` is not None, 'match' or a dict
    """
    if adapter_map is None:
        return {}
    if adapter_map == 'match':
        # Only pair an extractor with an adapter if an adapter of that name exists
        adapters = get_available_adapters()
        return {x: x for x in extractors if x in adapters}
    if not isinstance(adapter_map, dict):
        raise ValueError('Adapter map must be a dict, None, or `match`')

    # Give it to the user
    return adapter_map
def get_extractor_and_adapter_contexts(name, global_context, extractor_context, adapter_context):
    """
    Helper function to build the contexts used by an extractor/adapter pair
    that share the same name

    Each context starts as a deep copy of the global context and is then updated
    with the ``@all`` entry followed by the entry for ``name``, so the most
    specific setting wins.

    Args:
        name (str): adapter/extractor name.
        global_context (dict): Context of the files, used for every extractor and adapter
        extractor_context (dict): Context used for extractors. Key is the name of the extractor,
            value is the context. The key ``@all`` is used for context used for every extractor
        adapter_context (dict): Context used for adapters. Key is the name of the adapter,
            value is the context. The key ``@all`` is used for context used for every adapter
    Returns:
        (dict, dict): extractor context, adapter context tuple
    """

    def _layered(per_name_context):
        # Deep copy so that the caller's global context is never modified
        merged = deepcopy(global_context)
        for key in ('@all', name):
            merged.update(per_name_context.get(key, {}))
        return merged

    return _layered(extractor_context), _layered(adapter_context)
def _get_extractor_list(to_include: Union[None, List[str]],
                        to_exclude: Union[None, List[str]]) -> list:
    """Helper function to get a list of extractors given lists of extractors to include/exclude

    Args:
        to_include ([str]): Predefined list of extractors to run. Only these will be used.
            Mutually exclusive with `to_exclude`.
        to_exclude ([str]): List of extractors to exclude.
            Mutually exclusive with `to_include`.
    Returns:
        ([str]) Names of all applicable extractors. If neither argument is given,
        the names of all available extractors.
    Raises:
        ValueError: If both `to_include` and `to_exclude` are given, or if
            `to_include` names an extractor that is not available
    """
    # Reject contradictory arguments before doing any plugin discovery
    if to_include is not None and to_exclude is not None:
        raise ValueError('Including and excluding extractors are mutually exclusive')

    available = get_available_extractors()
    if to_include is not None:
        missing_extractors = set(to_include).difference(available)
        if missing_extractors:
            # Sorted so the message is the same from run to run
            raise ValueError('Some extractors are missing: ' + ' '.join(sorted(missing_extractors)))
        return list(to_include)
    if to_exclude is not None:
        # Filter rather than use a set difference so the order in which the
        # extractors were discovered is kept
        excluded = set(to_exclude)
        return [name for name in available if name not in excluded]

    # Always hand back a list of names, never the plugin-info dictionary
    return list(available)
def get_extractor(name: str) -> BaseExtractor:
    """Load the extractor plugin registered under a given name

    Args:
        name (str): Name of extractor
    Returns:
        (BaseExtractor) Requested extractor
    """
    # Load the extractor from the ``scythe.extractor`` plugin namespace
    mgr = DriverManager(namespace='scythe.extractor', name=name, invoke_on_load=True)

    # Give it to the user
    return mgr.driver
def get_adapter(name: str) -> BaseAdapter:
    """Load the adapter plugin registered under a given name

    Args:
        name (str): Name of adapter
    Returns:
        (BaseAdapter) Requested adapter
    """
    # Adapters live in the ``scythe.adapter`` plugin namespace
    return DriverManager(
        namespace='scythe.adapter',
        name=name,
        invoke_on_load=True
    ).driver
def run_extractor(name, group, context=None, adapter=None):
    """Run a single extractor on a group of files, optionally adapting its output

    Args:
        name (str): Name of the extractor
        group ([str]): Paths to group of files to be parsed
        context (dict): Context of the files, passed to both the extractor and the adapter
        adapter (str): Name of adapter to use to transform metadata
    Returns:
        Metadata generated by the extractor, or the output of the adapter's
        ``transform`` if an adapter was named
    """
    extractor = get_extractor(name)
    metadata = extractor.extract(group, context)

    # Without an adapter, the raw output of the extractor is the result
    if adapter is None:
        return metadata
    return get_adapter(adapter).transform(metadata, context=context)
def run_all_extractors_on_directory(directory: str, global_context=None,
                                    adapter_context: Union[None, dict] = None,
                                    extractor_context: Union[None, dict] = None,
                                    include_extractors: Union[None, List[str]] = None,
                                    exclude_extractors: Union[None, List] = None,
                                    adapter_map: Union[None, str, Dict[str, str]] = None,
                                    default_adapter: Union[None, str] = None) \
        -> Iterator[ExtractResult]:
    """Run all known extractors on a directory of files

    Args:
        directory (str): Path to directory to be parsed
        global_context (dict): Context of the files, used for every extractor and adapter
        adapter_context (dict): Context used for adapters. Key is the name of the adapter,
            value is the context. The key ``@all`` is used for context used for every adapter
        extractor_context (dict): Context used for extractors. Key is the name of the extractor,
            value is the context. The key ``@all`` is used for context used for every extractor
        include_extractors ([str]): Predefined list of extractors to run. Only these will be used.
            Mutually exclusive with `exclude_extractors`.
        exclude_extractors ([str]): List of extractors to exclude.
            Mutually exclusive with `include_extractors`.
        adapter_map (str, dict): Map of extractor name to the desired adapter.
            Use 'match' to find adapters with the same names
        default_adapter (str): Adapter to use if no other adapter is defined
    Yields:
        (ExtractResult) Tuple of (1) group of files, (2) name of extractor, (3) metadata
    """
    # Replace missing contexts with empty ones
    global_context = dict() if global_context is None else global_context
    adapter_context = dict() if adapter_context is None else adapter_context
    extractor_context = dict() if extractor_context is None else extractor_context

    # Work out which extractors to run and which adapter goes with each
    selected = _get_extractor_list(include_extractors, exclude_extractors)
    adapter_map = _get_adapter_map(adapter_map=adapter_map, extractors=selected)

    for name in selected:
        # Load the extractor and, if one is configured, its adapter
        extractor = get_extractor(name)
        adapter_name = adapter_map.get(name, default_adapter)
        adapter = None if adapter_name is None else get_adapter(adapter_name)

        my_extractor_context, my_adapter_context = get_extractor_and_adapter_contexts(
            name, global_context, extractor_context, adapter_context
        )

        for group, metadata in extractor.extract_directory(directory, context=my_extractor_context):
            # No adapter: pass the extractor output straight through
            if adapter is None:
                yield ExtractResult(group, name, metadata)
                continue

            # A failing adapter skips this group instead of stopping the whole run
            try:
                metadata = adapter.transform(metadata, my_adapter_context)
            except Exception as e:
                logger.warning(f'Adapter for {extractor} failed with caught exception: {e}')
                continue

            # Groups for which the adapter returned None are skipped
            if metadata is not None:
                yield ExtractResult(group, name, metadata)
def run_all_extractors_on_group(group,
                                adapter_map=None,
                                global_context=None,
                                adapter_context: Union[None, dict] = None,
                                extractor_context: Union[None, dict] = None,
                                include_extractors: Union[None, List[str]] = None,
                                exclude_extractors: Union[None, List] = None,
                                default_adapter: Union[None, str] = None):
    """
    Parse metadata from a file-group and adapt its metadata per a user-supplied adapter_map.

    This function is effectively a wrapper to run_extractor() that enables us to output metadata
    in the same format as run_all_extractors_on_directory(), but just on a single file group.

    Args:
        group ([str]): Paths to group of files to be parsed
        adapter_map (str, dict): Map of extractor name to the desired adapter.
            Use 'match' to find adapters with the same names
        global_context (dict): Context of the files, used for every extractor and adapter
        adapter_context (dict): Context used for adapters. Key is the name of the adapter,
            value is the context. The key ``@all`` is used for context used for every adapter
        extractor_context (dict): Context used for extractors. Key is the name of the extractor,
            value is the context. The key ``@all`` is used for context used for every extractor
        include_extractors ([str]): Predefined list of extractors to run. Only these will be used.
            Mutually exclusive with `exclude_extractors`.
        exclude_extractors ([str]): List of extractors to exclude.
            Mutually exclusive with `include_extractors`.
        default_adapter (str): Adapter to use if no other adapter is defined
    Yields:
        (ExtractResult) Tuple of (1) group of files, (2) name of extractor, (3) metadata
    """
    # Load in default arguments
    if global_context is None:
        global_context = dict()
    if adapter_context is None:
        adapter_context = dict()
    if extractor_context is None:
        extractor_context = dict()

    # Get the list of extractors
    extractors = _get_extractor_list(include_extractors, exclude_extractors)

    # Make the adapter map
    adapter_map = _get_adapter_map(adapter_map=adapter_map, extractors=extractors)

    for name in extractors:
        # Get the adapter name and the contexts for this extractor/adapter pair
        adapter_name = adapter_map.get(name, default_adapter)
        my_extractor_context, my_adapter_context = get_extractor_and_adapter_contexts(name,
                                                                                      global_context,
                                                                                      extractor_context,
                                                                                      adapter_context)

        # Run the extractor on its own, so that the adapter can be given the
        # adapter context rather than the extractor context
        metadata = run_extractor(name, group, context=my_extractor_context)

        # Run the adapter, if defined
        if adapter_name is not None:
            metadata = get_adapter(adapter_name).transform(metadata, context=my_adapter_context)

        yield ExtractResult(group, name, metadata)