wuttasync/src/wuttasync/importing/csv.py
Lance Edgar 328f8d9952 fix: implement deletion logic; add cli params for max changes
also add special UUID field handling for CSV -> SQLAlchemy ORM, to
normalize string from CSV to proper UUID so key matching works
2024-12-06 15:18:23 -06:00

291 lines
9.8 KiB
Python

# -*- coding: utf-8; -*-
################################################################################
#
# WuttaSync -- Wutta Framework for data import/export and real-time sync
# Copyright © 2024 Lance Edgar
#
# This file is part of Wutta Framework.
#
# Wutta Framework is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# Wutta Framework is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# Wutta Framework. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################
"""
Importing from CSV
"""
import csv
import logging
import uuid as _uuid
from collections import OrderedDict
from sqlalchemy_utils.functions import get_primary_keys
from wuttjamaican.db.util import make_topo_sortkey, UUID
from .base import FromFile
from .handlers import FromFileHandler
from .wutta import ToWuttaHandler
from .model import ToWutta
# module-level logger, named after this module via ``__name__``
log = logging.getLogger(__name__)
class FromCsv(FromFile):
    """
    Base class for importer/exporter using CSV file as data source.

    Note that this assumes a particular "format" for the CSV files.
    If your needs deviate you should override more methods, e.g.
    :meth:`open_input_file()`.

    The default logic assumes CSV file is mostly "standard" - e.g.
    comma-delimited, UTF-8-encoded etc.  But it also assumes the first
    line/row in the file contains column headers, and all subsequent
    lines are data rows.

    .. attribute:: input_reader

       While the input file is open, this will reference a
       :class:`python:csv.DictReader` instance.
    """

    csv_encoding = 'utf_8'
    """
    Encoding used by the CSV input file.

    You can specify an override if needed when calling
    :meth:`~wuttasync.importing.handlers.ImportHandler.process_data()`.
    """

    def get_input_file_name(self):
        """
        By default this returns the importer/exporter model name plus
        CSV file extension, e.g. ``Widget.csv``

        It calls
        :meth:`~wuttasync.importing.base.Importer.get_model_title()`
        to obtain the model name.

        If the instance has an ``input_file_name`` attribute, that
        value is returned as-is instead.
        """
        # nb. an explicit file name always wins over the default
        if hasattr(self, 'input_file_name'):
            return self.input_file_name

        model_title = self.get_model_title()
        return f'{model_title}.csv'

    def open_input_file(self):
        """
        Open the input file for reading, using a CSV parser.

        This tracks the file handle via
        :attr:`~wuttasync.importing.base.FromFile.input_file` and the
        CSV reader via :attr:`input_reader`.

        It also updates the effective
        :attr:`~wuttasync.importing.base.Importer.fields` list per the
        following logic:

        First get the current effective field list, e.g. as defined by
        the class and/or from caller params.  Then read the column
        header list from CSV file, and discard any which are not found
        in the first list.  The result becomes the new effective field
        list.

        The file handle is closed again if anything goes wrong while
        the header row is being inspected.

        :raises ValueError: If none of the column headers in the file
           match a supported field.
        """
        path = self.get_input_file_path()
        log.debug("opening input file: %s", path)

        # nb. newline='' is what the csv module expects; without it,
        # newlines embedded in quoted fields are not parsed correctly
        self.input_file = open(path, 'rt', encoding=self.csv_encoding,
                               newline='')

        try:
            self.input_reader = csv.DictReader(self.input_file)

            # nb. importer may have all supported fields by default, so
            # must prune to the subset also present in the input file
            fields = self.get_fields()
            orientation = self.orientation.value
            log.debug("supported fields for %s: %s", orientation, fields)

            # nb. accessing fieldnames reads the header row; it may be
            # None (e.g. for an empty file) hence the fallback
            self.fields = [f for f in self.input_reader.fieldnames or []
                           if f in fields]
            log.debug("fields present in source data: %s", self.fields)

            if not self.fields:
                raise ValueError("input file has no recognized fields")

        except Exception:
            # nb. do not leak the file handle when setup fails
            self.input_file.close()
            raise

    def close_input_file(self):
        """ """
        # close the handle, then forget both the reader and the handle
        # so they cannot be used after the file is closed
        self.input_file.close()
        del self.input_reader
        del self.input_file

    def get_source_objects(self):
        """
        This returns a list of data records "as-is" from the CSV
        source file (via :attr:`input_reader`).

        Since this uses :class:`python:csv.DictReader` by default,
        each record will be a dict with key/value for each column in
        the file.
        """
        # nb. this consumes the reader, loading all rows at once
        return list(self.input_reader)
class FromCsvToSqlalchemyMixin:
    """
    Mixin for :term:`importers <importer>` which read from CSV and
    write to a SQLAlchemy ORM target.

    Meant to be used by :class:`FromCsvToSqlalchemyHandlerMixin`.

    The purpose of this mixin is to deal with key fields whose target
    column is of :class:`~wuttjamaican:wuttjamaican.db.util.UUID`
    data type.  Values for such fields are converted to proper UUID
    instances when normalizing each source record, so that key
    matching between source and target will behave as expected.
    """

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)

        # nb. remember which key fields are backed by exactly one
        # column, where that column uses the proper UUID type
        self.uuid_keys = []
        for name in self.get_keys():
            columns = getattr(self.model_class, name).prop.columns
            if len(columns) == 1 and isinstance(columns[0].type, UUID):
                self.uuid_keys.append(name)

    def normalize_source_object(self, obj):
        """ """
        # nb. always work on a copy; caller's object is left alone
        record = dict(obj)

        # nb. swap in proper UUID values where applicable, so that
        # key matching will work properly
        for field in self.uuid_keys:
            value = record[field]
            if not value or isinstance(value, _uuid.UUID):
                # empty, or already a UUID; nothing to convert
                continue
            record[field] = _uuid.UUID(value)

        return record
class FromCsvToSqlalchemyHandlerMixin:
    """
    Mixin for :term:`import handlers <import handler>` which read
    from CSV and write to a SQLAlchemy ORM target.

    Rather than requiring importers to be defined by hand, this
    generates :term:`importer` classes dynamically for the ORM
    involved.  Each generated class inherits from
    :class:`FromCsvToSqlalchemyMixin`, plus whatever
    :attr:`FromImporterBase` and :attr:`ToImporterBase` reference.

    All of that happens within :meth:`define_importers()`.
    """
    source_key = 'csv'
    generic_source_title = "CSV"

    FromImporterBase = FromCsv
    """
    Base class for the CSV source side of each generated importer.

    Defaults to :class:`FromCsv`, which should typically be fine;
    you can change if needed.
    """

    # nb. subclass must define this
    ToImporterBase = None
    """
    Base class for the ORM target side of each generated importer.

    A handler using this mixin must set this to a valid class, since
    :meth:`make_importer_factory()` uses it as one of the bases when
    dynamically generating new importer/exporter classes.
    """

    def get_target_model(self):
        """
        Return the :term:`app model` or a similar module containing
        the data model classes for the target side.

        The result is inspected by :meth:`define_importers()` in
        order to dynamically generate a set of importers (e.g. one
        per table in the target DB) which can use CSV file as data
        source.

        Subclass must override this; default logic simply raises
        ``NotImplementedError``.
        """
        raise NotImplementedError

    def define_importers(self):
        """
        Dynamically generate the set of importers, instead of relying
        on typical (manual) importer definition.

        The target model is obtained from :meth:`get_target_model()`
        and then :meth:`make_importer_factory()` is called for each
        model class found within it.

        :returns: An ordered dict of importer classes keyed by model
           name, sorted according to schema topography.
        """
        model = self.get_target_model()

        # mostly try to make an importer for every data model; i.e.
        # anything which is a proper subclass of the declarative base
        factories = {}
        for name in dir(model):
            candidate = getattr(model, name)
            if not isinstance(candidate, type):
                continue
            if candidate is model.Base or not issubclass(candidate, model.Base):
                continue
            factories[name] = self.make_importer_factory(candidate, name)

        # sort importers according to schema topography
        sortkey = make_topo_sortkey(model)
        ordered = OrderedDict()
        for name in sorted(factories, key=sortkey):
            ordered[name] = factories[name]
        return ordered

    def make_importer_factory(self, model_class, name):
        """
        Generate and return a new :term:`importer` class, targeting
        the given :term:`data model` class.

        The newly-created class will inherit from:

        * :class:`FromCsvToSqlalchemyMixin`
        * :attr:`FromImporterBase`
        * :attr:`ToImporterBase`

        Its ``key`` is set to the list of primary keys for the model
        class.

        :param model_class: A data model class.

        :param name: The "model name" for the importer/exporter.  New
           class name will be based on this, so e.g. ``Widget`` model
           name becomes ``WidgetImporter`` class name.

        :returns: The new class, meant to process import/export
           targeting the given data model.
        """
        bases = (
            FromCsvToSqlalchemyMixin,
            self.FromImporterBase,
            self.ToImporterBase,
        )
        attrs = {
            'model_class': model_class,
            'key': list(get_primary_keys(model_class)),
        }
        return type(f'{name}Importer', bases, attrs)
class FromCsvToWutta(FromCsvToSqlalchemyHandlerMixin, FromFileHandler, ToWuttaHandler):
    """
    Import handler for CSV → Wutta :term:`app database`.

    Nearly all of the logic comes from
    :class:`FromCsvToSqlalchemyHandlerMixin`; this class only sets
    the target importer base and supplies the target model.
    """
    ToImporterBase = ToWutta

    def get_target_model(self):
        """ """
        # nb. the model exposed by the app handler is the target
        app = self.app
        return app.model