# Source code for scythe.csv
from scythe.base import BaseSingleFileExtractor
from tableschema.exceptions import CastError
from tableschema import Table
from typing import List
import logging
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
class CSVExtractor(BaseSingleFileExtractor):
    """Describe the contents of a comma-separated value (CSV) file

    The context dictionary for the CSV parser includes several fields:
        - ``schema``: Dictionary defining the schema for this dataset, following that of
          FrictionlessIO
        - ``na_values``: List of any values that should be interpreted as missing
    """

    def __init__(self, return_records=True, **kwargs):
        """
        Args:
            return_records (bool): Whether to return each row in the CSV file
        Keyword:
            All kwargs as passed to `TableSchema's infer <https://github.com/frictionlessdata/tableschema-py#infer>`_ method
        """
        self.return_records = return_records
        self.infer_kwargs = kwargs

    def _extract_file(self, path: str, context=None):
        """Extract the schema and, optionally, the rows of a single CSV file

        Args:
            path (str): Path to the CSV file
            context (dict): Optional settings. Only the ``schema`` and ``na_values``
                keys are read here
        Returns:
            (dict) The schema descriptor of the table under ``schema`` and, if
            ``return_records`` is set, a list with one dictionary per row (keyed by
            the schema headers) under ``records``
        Raises:
            ValueError: If ``context['na_values']`` is present but is not a list
        """
        # Set the default value
        if context is None:
            context = dict()

        # Load in the table
        table = Table(path, schema=context.get('schema', None))

        # Infer the table's schema
        table.infer(**self.infer_kwargs)

        # Add missing values
        if 'na_values' in context:
            if not isinstance(context['na_values'], list):
                raise ValueError('context["na_values"] must be a list')
            # The empty string is always treated as missing, in addition to the user's values
            table.schema.descriptor['missingValues'] = sorted({'', *context['na_values']})
            table.schema.commit()

        # Store the schema
        output = {'schema': table.schema.descriptor}

        # If desired, store the data
        if self.return_records:
            headers = table.schema.headers
            records = []
            failed_records = 0
            for row in table.iter(keyed=False, cast=False):
                # Rows that cannot be cast are counted and kept in their uncast form
                try:
                    row = table.schema.cast_row(row)
                except CastError:
                    failed_records += 1

                # TODO (wardlt): Use json output from tableschema once it's supported
                #  https://github.com/frictionlessdata/tableschema-py/issues/213
                # NOTE(review): eval(repr(...)) re-evaluates text derived from the file's
                #  contents and only works for values whose repr is a literal resolvable
                #  in this module's namespace. Presumably this breaks for cast values
                #  such as Decimal or datetime objects -- confirm and replace with an
                #  explicit conversion.
                records.append(eval(repr(dict(zip(headers, row)))))

            if failed_records > 0:
                logger.warning(f'{failed_records} records failed casting with schema')
            output['records'] = records
        return output

    def implementors(self) -> List[str]:
        """List the names of the people who implemented this extractor"""
        return ['Logan Ward']

    def citations(self) -> List[str]:
        """List the references associated with this extractor"""
        return ["https://github.com/frictionlessdata/tableschema-py"]

    def version(self) -> str:
        """Return the version string of this extractor"""
        return '0.0.1'