Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

work adding import functionality using django-import-export #800

Merged
merged 26 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
14b480d
person import work
quadrismegistus Jan 16, 2024
726ee48
first attribute working
quadrismegistus Jan 16, 2024
2b17de0
both working
quadrismegistus Jan 16, 2024
5e897f7
condensing code
quadrismegistus Jan 16, 2024
20d7e08
minor
quadrismegistus Jan 16, 2024
cc4b970
adding countryadmin back
quadrismegistus Jan 16, 2024
e38652f
turning off index signals while importing
quadrismegistus Jan 16, 2024
66bd9c6
working now with shared set of cols
quadrismegistus Jan 17, 2024
d6b0209
quick fixes
quadrismegistus Jan 17, 2024
099c027
should be working now
quadrismegistus Jan 18, 2024
a05b9a4
new settings; export cols separated
quadrismegistus Jan 19, 2024
feec6bd
adding index logic
quadrismegistus Jan 19, 2024
9e810e8
indexing
quadrismegistus Jan 19, 2024
c6e86c1
update
quadrismegistus Jan 19, 2024
c5b45a3
all working now
quadrismegistus Jan 19, 2024
ae06676
tests for import and export at requests level
quadrismegistus Jan 22, 2024
10ee53c
minor fix
quadrismegistus Jan 22, 2024
239d979
remove redundant code
quadrismegistus Jan 22, 2024
305ae29
reformatting
quadrismegistus Jan 24, 2024
26e767d
reformatting
quadrismegistus Jan 24, 2024
75e0e4f
cleanup
quadrismegistus Jan 24, 2024
bda61a6
added unit test
quadrismegistus Jan 24, 2024
1c27716
ensuring indexing disabled after person admin testing
quadrismegistus Jan 25, 2024
3dcba17
ensuring indexing disabled after person admin testing 2
quadrismegistus Jan 25, 2024
c2d7df8
Cleanup viaf person save test method for new skip lookup config
rlskoeser Jan 25, 2024
b2dd870
cleanup and making import model resource extensible for books
quadrismegistus Jan 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ django-debug-toolbar
sphinx
wheel
pre-commit
wagtail-factories
wagtail-factories
139 changes: 136 additions & 3 deletions mep/people/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from django.utils.timezone import now
from tabular_export.admin import export_to_csv_response
from viapy.widgets import ViafWidget

from mep.accounts.admin import AddressInline
from mep.common.admin import (
CollapsedTabularInline,
Expand All @@ -27,6 +26,43 @@
Relationship,
RelationshipType,
)
from import_export.admin import (
ImportExportModelAdmin,
)
from import_export.resources import ModelResource
from import_export.widgets import ManyToManyWidget, Widget
from import_export.fields import Field
from parasolr.django.signals import IndexableSignalHandler

# Columns read from an uploaded file when importing people.
PERSON_IMPORT_COLUMNS = (
    "slug",
    "gender",
    "nationalities",
)

# Columns written when exporting people, in output order.
PERSON_IMPORT_EXPORT_COLUMNS = (
    "slug",
    "name",
    "birth_year",
    "death_year",
    "gender",
    "nationalities",
    "notes",
    "start_year",
    "end_year",
    "mep_id",
    "sort_name",
    "viaf_id",
    "is_organization",
    "verified",
    "title",
    "profession",
    "relations",
    "public_notes",
    "locations",
    "updated_at",
    "id",
)


class InfoURLInline(CollapsibleTabularInline):
Expand Down Expand Up @@ -82,7 +118,6 @@ class Meta:
)
}


class CountryAdmin(admin.ModelAdmin):
form = CountryAdminForm
list_display = ("name", "geonames_id", "code")
Expand Down Expand Up @@ -436,8 +471,106 @@ class Media:
]


class ExportPersonResource(ModelResource):
    """Resource describing how Person records are exported.

    Exports the columns named in PERSON_IMPORT_EXPORT_COLUMNS, in that order.
    """

    class Meta:
        model = Person
        # same tuple drives both which fields are exported and their order
        fields = PERSON_IMPORT_EXPORT_COLUMNS
        export_order = PERSON_IMPORT_EXPORT_COLUMNS


class PersonResource(ModelResource):
def __init__(self, *args, **kwargs):
    """Initialize the resource and start with an empty batch of objects to index."""
    super().__init__(*args, **kwargs)
    # saved instances accumulate here so they can be indexed in one batch
    # at the end of the import
    self.objects_to_index = []

def before_import(self, dataset, *args, **kwargs):
    """
    Prepare for an import run.

    Normalizes the dataset headers to lower snake_case, disconnects the
    index signal handler, and sets ``settings.SKIP_VIAF_LOOKUP`` to True.
    """
    # lower-case and snake_case the headers; skip datasets that have no
    # headers at all, and tolerate padded or non-string header cells
    if dataset.headers:
        dataset.headers = [
            ("" if header is None else str(header)).strip().lower().replace(" ", "_")
            for header in dataset.headers
        ]

    # turn off indexing temporarily
    IndexableSignalHandler.disconnect()

    # turn off viaf lookups
    settings.SKIP_VIAF_LOOKUP = True

def before_import_row(self, row, **kwargs):
    """
    Called on an OrderedDictionary of row attributes.
    Opportunity to do quick string formatting as a
    principle of charity to annotators before passing
    values into django-import-export lookup logic.
    """
    # make sure nation has no string padding; leave the row untouched when
    # the column is absent or empty, since str(None) would otherwise store
    # the literal text "None"
    nation = row.get('nation')
    if nation is not None:
        row['nation'] = str(nation).strip()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does django-import-export not handle this automatically?

I see "Strip whitespace when looking up ManyToMany fields (#668)" in the changelog for version 0.6 although not finding results elsewhere in the docs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, good call, this should be done automatically


# gender to one char; a missing or None value becomes blank instead of
# letting str(None) turn into the bogus code "N"
gstr = str(row.get('gender') or '').strip()
row['gender'] = gstr[0].upper() if gstr else ''

def after_save_instance(self, instance, using_transactions, dry_run):
    """
    Called when an instance either was or would be saved (depending on dry_run).

    Records the instance in ``self.objects_to_index`` so it can be indexed
    in a batch later; the instance is recorded whether or not this is a
    dry run.
    """
    self.objects_to_index.append(instance)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

surprised you need to collect these yourself, seems like something django-import-export should handle for you

can you use this 'store instance' option instead? https://django-import-export.readthedocs.io/en/latest/advanced_usage.html#access-full-instance-data

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that. That gets us the instance object on after_import_row and after_save_instance, i.e. gives us the instance of a row as it's imported or saved, but does not store all the saved/updated/imported instances for access later. That means without storing the instances somewhere ourselves we can't use batch indexing, only index one object at a time.

return super().after_save_instance(instance, using_transactions, dry_run)

def after_import(self, dataset, result, using_transactions, dry_run, **kwargs):
    """
    Called after importing, twice: once with dry_run==True (preview),
    once dry_run==False. We report how many objects were updated and need to be indexed.
    We only index them when dry_run is False.
    """
    import logging

    # run parent method
    super().after_import(dataset, result, using_transactions, dry_run, **kwargs)

    # report how many need indexing (debug log rather than printing to stdout)
    logging.getLogger(__name__).debug(
        "indexing %d objects, dry_run = %s", len(self.objects_to_index), dry_run
    )
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make sure you clean this up before merging (converting to debug logging would be fine if you think it may be helpful in future)


# only continue if not a dry run
if not dry_run:
    # re-enable indexing
    # NOTE(review): before_import disconnects the handler on every run,
    # dry runs included, but it is only reconnected here for real imports;
    # confirm a preview cannot leave indexing signals disconnected
    IndexableSignalHandler.connect()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could it be this causing 1 or 2 of the failing tests? The signal handler ought to revert to its original condition (connected) but maybe it's not somehow and we need to ensure that

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ooh good catch, I bet you're right

Copy link
Contributor Author

@quadrismegistus quadrismegistus Jan 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe i should revert index signal status in a tearDown method on the testing class. and/or revert to previous value instead of assuming it was previously connected

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

possible confirmation - the tests pass when you run just the ones that are failing but fail when you run the whole test suite (i.e., the problem is the interaction between the tests as you've already figured out)

I don't think we currently have a way for you to know what the original condition was! (yet)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@quadrismegistus quick pr on parasolr to update the disconnect method to return a count of handlers disconnected; could use this to determine whether to reconnect Princeton-CDH/parasolr#84

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but it is pretty likely a test-only scenario, so your other solution may be simpler + sufficient


# index objects
# batch-index everything collected in after_save_instance, if anything
if self.objects_to_index:
    Person.index_items(self.objects_to_index)

# turn viaf lookups back on
# NOTE(review): this always resets the flag to False rather than restoring
# its previous value; the original nesting is not visible here, so confirm
# whether this also runs after a dry run
settings.SKIP_VIAF_LOOKUP = False



# only customized fields need specifying here
# nationalities is handled by a many-to-many widget: countries are matched
# on their name, with ";" separating multiple values in one cell
nationalities = Field(
    column_name='nationalities',
    attribute='nationalities',
    widget=ManyToManyWidget(Country, field='name', separator=';')
)

class Meta:
    model = Person
    # the import resource is limited to the import columns
    fields = PERSON_IMPORT_COLUMNS
    # slug is the id field used for import
    import_id_fields = ('slug',)
    export_order = PERSON_IMPORT_COLUMNS
Copy link
Contributor Author

@quadrismegistus quadrismegistus Jan 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right now export only exports the columns that are imported. We can change this (though there's annoyingly not a simple way to do this it appears in dj-imp-exp) or just document that for now. For instance if we wanted to annotate profession we could add that to both and write a little profession = Field(...ForeignKey(..)) in the code above

Copy link
Contributor

@rlskoeser rlskoeser Jan 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like an ok limitation since that wasn't part of the scope you were working on

(seems like a weird limitation on their part, but something we can live with)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got this working in latest, we export most everything we display in the Person table, and import only what we need

skip_unchanged = True
report_skipped = True


class PersonAdminImportExport(PersonAdmin, ImportExportModelAdmin):
    """Person admin combining PersonAdmin with ImportExportModelAdmin."""

    # default resource; exporting is overridden below
    resource_class = PersonResource

    def get_export_resource_class(self):
        """
        Specifies the resource class to use for exporting,
        so that separate fields can be exported than those imported
        """
        return ExportPersonResource
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice! glad you figured out a solution



# enable default admin to see imported data
# Person is registered once, with the import/export-enabled admin: registering
# the same model a second time on one admin site raises AlreadyRegistered
admin.site.register(Person, PersonAdminImportExport)
admin.site.register(Country, CountryAdmin)
admin.site.register(Location, LocationAdmin)
admin.site.register(Profession, NamedNotableAdmin)
Expand Down
4 changes: 2 additions & 2 deletions mep/people/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
import logging
from string import punctuation

from django.conf import settings
from django.apps import apps
from django.contrib.contenttypes.fields import GenericRelation
from django.core.exceptions import MultipleObjectsReturned
Expand Down Expand Up @@ -533,7 +533,7 @@ def save(self, *args, **kwargs):
"""Adds birth and death dates if they aren't already set
and there's a viaf id for the record"""

if self.viaf_id and not self.birth_year and not self.death_year:
if not getattr(settings,'SKIP_VIAF_LOOKUP',False) and self.viaf_id and not self.birth_year and not self.death_year:
self.set_birth_death_years()

# if slug has changed, save the old one as a past slug
Expand Down
161 changes: 154 additions & 7 deletions mep/people/tests/test_admin.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,39 @@
from unittest.mock import Mock, patch

from io import StringIO
import csv
import os
import random
import tempfile
from django.contrib import admin
from datetime import date

from datetime import date, datetime
from django.http import HttpResponseRedirect
from django.test import TestCase
from django.test import TestCase, Client
from django.urls import reverse
from django.utils.timezone import now
from django.apps import apps

from mep.accounts.models import Account, Subscription
from mep.books.models import Creator, CreatorType, Work
from mep.people.admin import PersonAdmin, PersonTypeListFilter
from mep.people.models import Person, PastPersonSlug

from mep.people.admin import PersonAdmin, PersonTypeListFilter, PersonAdminImportExport, PERSON_IMPORT_EXPORT_COLUMNS, ExportPersonResource
from mep.people.models import Person, PastPersonSlug, Country
from django.conf import settings

class TestPersonAdmin(TestCase):
fixtures = ["sample_people"]

def setUp(self):
    """Create a superuser for the admin import/export request tests."""
    import uuid

    User = apps.get_model("auth", "User")
    # script user needed for log entry logic
    # generate a throwaway password instead of hard-coding a credential;
    # keep it in a local so it can be used to log in later
    password = str(uuid.uuid4())
    self.admin_user = User.objects.create_superuser('admin', 'admin@admin.com', password)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've had code like this flagged for including passwords in source code (even though it's test code). Was looking elsewhere to see what we've done in this code base - in newer projects we're using pytest-django with an admin client fixture but I don't think that's easy to use here.

One workaround (used elsewhere in mep-django) has been to generate a password so it isn't a hard-coded string. (You'll have to import uuid for this to work)

Suggested change
password = 'adminpass'
self.admin_user = User.objects.create_superuser('admin', 'admin@admin.com', password)
password = str(uuid.uuid4())
self.admin_user = User.objects.create_superuser('admin', 'admin@admin.com', password)

self.client = Client()
# the admin user must be logged in before sending requests through the client
self.client.login(username=self.admin_user.username, password=password)
# resolve the import/export admin views by URL name instead of hard-coding
# paths, so the tests keep working if the admin is mounted elsewhere
self.url_person_import = reverse("admin:people_person_import")
self.url_person_process_import = reverse("admin:people_person_process_import")
self.url_person_export = reverse("admin:people_person_export")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's use named urls for these (weirdly not documented in django-i-e docs that I could find)

I think this should work:

Suggested change
self.url_person_import = '/admin/people/person/import/'
self.url_person_process_import = '/admin/people/person/process_import/'
self.url_person_export = '/admin/people/person/export/'
self.url_person_import = reverse("admin:people_person_import")
self.url_person_process_import = reverse("admin:people_person_process_import")
self.url_person_export = reverse("admin:people_person_export")

def test_merge_people(self):
mockrequest = Mock()
test_ids = ["5", "33", "101"]
Expand Down Expand Up @@ -99,6 +116,129 @@ def test_export_csv(self, mock_export_to_csv_response):
# or title case for property with no verbose name
assert "Is Creator" in headers


def _djangoimportexport_do_export_post(self, file_format=0):
response = self.client.post(self.url_person_export, {'file_format':str(file_format)})
return response


def test_djangoimportexport_export(self):
    """Export view serves a CSV with the expected header and one row per person."""
    ### test can get page
    response = self.client.get(self.url_person_export)
    self.assertEqual(response.status_code, 200)

    ### test can post to page and get csv data back
    date_str = datetime.now().strftime("%Y-%m-%d")
    response = self._djangoimportexport_do_export_post(file_format=0)  # csv

    # test response
    self.assertEqual(response.status_code, 200)
    self.assertTrue(response.has_header("Content-Disposition"))
    self.assertEqual(response["Content-Type"], "text/csv")
    self.assertEqual(
        response["Content-Disposition"],
        'attachment; filename="Person-{}.csv"'.format(date_str),
    )

    # test csv as binary string response
    lines = response.content.splitlines()
    assert len(lines) > 0, 'no header returned'
    self.assertEqual(
        ','.join(PERSON_IMPORT_EXPORT_COLUMNS).encode(),
        lines[0],
    )

    # test csv via csv reader
    f = StringIO(response.content.decode())
    reader = csv.DictReader(f, delimiter=',')
    rows = list(reader)
    persons = Person.objects.all()

    # test num lines, should be a row per person
    assert len(rows) == len(persons)

    # test values by row, rendering each expected value through the same
    # export resource the admin uses
    person_admin = PersonAdminImportExport(model=Person, admin_site=admin.site)
    export_class = person_admin.get_export_resource_class()
    exporter = export_class()

    def getstr(person, attr, default=''):
        field = exporter.fields[attr]
        res = exporter.export_field(field, person)
        # keep a literal 0, but map other falsy values to the default
        return str(res) if res or res == 0 else default

    for person, row in zip(persons, rows):
        for attr in PERSON_IMPORT_EXPORT_COLUMNS:
            # assertEqual, not the deprecated assertEquals alias
            # (removed in Python 3.12)
            self.assertEqual(getstr(person, attr), row[attr])


def _djangoimportexport_do_import_post(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add comments to make clear what this helper function is doing

self,
url,
filename,
input_format=0,
follow=False
):
with open(filename, "rb") as f:
data = {
"input_format": str(input_format),
"import_file": f,
}
response = self.client.post(url, data, follow=follow)
return response

def test_djangoimportexport_import(self):
    """Round-trip: export people, change gender and nationality, re-import, confirm."""
    ### test can get page
    response = self.client.get(self.url_person_import)
    self.assertEqual(response.status_code, 200)
    self.assertTemplateUsed(response, "admin/import_export/import.html")
    self.assertContains(response, 'form action=""')

    tmpfn = 'persons.csv'
    ## test import with changed data

    with tempfile.TemporaryDirectory() as tmpdir:
        csv_filename = os.path.join(tmpdir, tmpfn)
        # quick export
        response = self._djangoimportexport_do_export_post()
        # modify: give every row a different gender and nationality
        f = StringIO(response.content.decode())
        reader = csv.DictReader(f, delimiter=',')
        rows = list(reader)
        countries = [c.name for c in Country.objects.all()]
        for row in rows:
            row['gender'] = random.choice(
                [x for x, y in Person.GENDER_CHOICES if x != row['gender']]
            )
            row['nationalities'] = random.choice(
                [x for x in countries if x != row['nationalities']]
            )
        # save; newline='' lets the csv module control line endings, as the
        # csv documentation requires for files passed to a writer
        with open(csv_filename, 'w', newline='') as of:
            writer = csv.DictWriter(of, fieldnames=reader.fieldnames)
            writer.writeheader()
            writer.writerows(rows)

        # now import (first request shows the preview / confirmation form)
        response = self._djangoimportexport_do_import_post(self.url_person_import, csv_filename)
        self.assertEqual(response.status_code, 200)
        self.assertIn("result", response.context)
        self.assertFalse(response.context["result"].has_errors())
        self.assertIn("confirm_form", response.context)
        confirm_form = response.context["confirm_form"]

        # second request confirms the import using the form's initial data
        data = confirm_form.initial
        self.assertEqual(data["original_file_name"], tmpfn)
        response = self.client.post(self.url_person_process_import, data, follow=True)
        self.assertEqual(response.status_code, 200)
        self.assertContains(
            response,
            ("Import finished, with {} new and {} updated {}.").format(
                0, len(rows), Person._meta.verbose_name_plural
            ),
        )

        # one result row per imported person on the page we land on
        assert response.content.count(b'<tr class="grp-row') == len(rows)



def test_past_slugs_list(self):
person_admin = PersonAdmin(model=Person, admin_site=admin.site)
person = Person.objects.order_by("id").first()
Expand Down Expand Up @@ -170,3 +310,10 @@ def test_queryset(self):
assert foo in qs
assert not engelbert in qs
assert not humperdinck in qs


# New tests:
# `before_import_row`
# import formatting methods for gender and nation (you'll need to adjust the nesting)
# get_import_fields
# Person.save optional behavior (skip viaf lookup)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make sure to clean up before merging.

5 changes: 4 additions & 1 deletion mep/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"mep.books",
"mep.footnotes",
"mep.pages",
"import_export"
]

MIDDLEWARE = [
Expand Down Expand Up @@ -323,4 +324,6 @@
}
}

WAGTAILADMIN_BASE_URL = "https://shakespeareandco.princeton.edu/cms/"
WAGTAILADMIN_BASE_URL = "https://shakespeareandco.princeton.edu/cms/"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't needed since you're assuming false if it's unset, let's omit

Suggested change
SKIP_VIAF_LOOKUP = False

SKIP_VIAF_LOOKUP = False
Loading
Loading