This repository has been archived by the owner on Dec 18, 2019. It is now read-only.

Merge branch 'release-v33.3.0'
Mark Breedlove committed Jun 2, 2015
2 parents 4f41775 + eaa7bbc commit 9aebf3b
Showing 16 changed files with 794 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ caches/*
 akara.ini
 akara.conf
 akara.confc
-ENV/*
+ENV/*
+couchdb.*
1 change: 1 addition & 0 deletions akara.conf.template
@@ -154,6 +154,7 @@ MODULES = [
"dplaingestion.akamod.filter_empty_values",
"dplaingestion.akamod.artstor_select_isshownat",
"dplaingestion.akamod.artstor_identify_object",
"dplaingestion.akamod.cdl_identify_object",
"dplaingestion.akamod.contentdm_identify_object",
"dplaingestion.akamod.move_date_values",
"dplaingestion.akamod.enrich_location",
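Registering the module here is what activates the new enrichment below: Akara imports each MODULES entry at server startup, which runs the module's @simple_service decorator and exposes its HTTP endpoint. A minimal sketch of that pattern (illustrative only, not Akara's actual loader; it assumes the dplaingestion package is importable):

import importlib

MODULES = [
    "dplaingestion.akamod.cdl_identify_object",
]

for name in MODULES:
    # Importing the module executes its @simple_service decorator,
    # which is what registers the service with the server.
    importlib.import_module(name)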
55 changes: 55 additions & 0 deletions lib/akamod/cdl_identify_object.py
@@ -0,0 +1,55 @@
from akara import logger
from akara import response
from akara.services import simple_service
from amara.thirdparty import json
from dplaingestion.selector import getprop, setprop, exists, delprop
from akara import module_config
from amara.lib.iri import is_absolute

@simple_service('POST', 'http://purl.org/la/dp/cdl_identify_object',
                'cdl_identify_object', 'application/json')
def cdl_identify_object(body, ctype):
    """
    Responsible for adding a field to a document with the URL where we
    should expect to find the thumbnail.
    """

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    url = None
    if exists(data, "object"):
        handle = getprop(data, "object")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break
    if exists(data, "originalRecord/doc/isShownBy"):
        handle = getprop(data, "originalRecord/doc/isShownBy")
        for h in (handle if not isinstance(handle, basestring) else [handle]):
            if is_absolute(h):
                url = h
                break

    if url:
        if 'content.cdlib.org' in url:
            base_url, obj_id, object_type = url.rsplit("/", 2)
            is_shown_at = getprop(data, "isShownAt")
            is_shown_at_base, is_shown_at_id = is_shown_at.rsplit("/", 1)
            if obj_id != is_shown_at_id:
                logger.warn("Object url for %s has ARK value (%s) that does "
                            "not match isShownAt (%s)" %
                            (data["_id"], obj_id, is_shown_at_id))
                obj_id = is_shown_at_id
            url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')

        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)
    return json.dumps(data)
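The content.cdlib.org branch above rewrites the object URL in three steps. A standalone sketch with a hypothetical ARK identifier (the value is invented; only the string operations mirror the service):

url = "http://content.cdlib.org/ark:/13030/kt0000zzzz/hi-res"
is_shown_at = "http://content.cdlib.org/ark:/13030/kt0000zzzz"

# Split off the last two path segments: the ARK id and the object type.
base_url, obj_id, object_type = url.rsplit("/", 2)
# base_url    -> "http://content.cdlib.org/ark:/13030"
# obj_id      -> "kt0000zzzz"
# object_type -> "hi-res"

# isShownAt supplies the authoritative ARK id if the two disagree.
is_shown_at_id = is_shown_at.rsplit("/", 1)[1]
url = "/".join([base_url, is_shown_at_id, object_type])

# A hi-res URL is kept as hasView, and its thumbnail variant becomes
# the record's "object".
if object_type == "hi-res":
    url = url.replace("hi-res", "thumbnail")
# url -> "http://content.cdlib.org/ark:/13030/kt0000zzzz/thumbnail"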
10 changes: 10 additions & 0 deletions lib/create_fetcher.py
@@ -42,6 +42,14 @@ def _create_oai_verbs_fetcher(profile, uri_base, config_file):
         from dplaingestion.fetchers.oai_verbs_fetcher import OAIVerbsFetcher
         return OAIVerbsFetcher(profile, uri_base, config_file)
 
+    def _create_mdl_api_fetcher(profile, uri_base, config_file):
+        from dplaingestion.fetchers.mdl_api_fetcher import MDLAPIFetcher
+        return MDLAPIFetcher(profile, uri_base, config_file)
+
+    def _create_cdl_fetcher(profile, uri_base, config_file):
+        from dplaingestion.fetchers.cdl_fetcher import CDLFetcher
+        return CDLFetcher(profile, uri_base, config_file)
+
     fetchers = {
         'ia': lambda p, u, c: _create_ia_fetcher(p, u, c),
         'uva': lambda p, u, c: _create_uva_fetcher(p, u, c),
@@ -52,6 +60,8 @@ def _create_oai_verbs_fetcher(profile, uri_base, config_file):
         'getty': lambda p, u, c: _create_getty_fetcher(p, u, c),
         'hathi': lambda p, u, c: _create_hathi_fetcher(p, u, c),
         'oai_verbs': lambda p, u, c: _create_oai_verbs_fetcher(p, u, c),
+        'mdl': lambda p, u, c: _create_mdl_api_fetcher(p, u, c),
+        'cdl': lambda p, u, c: _create_cdl_fetcher(p, u, c)
     }
 
     with open(profile_path, "r") as f:
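The lookup that consumes this table sits below the hunk shown here. A usage sketch of the dispatch, with a hypothetical profile (real profiles carry many more keys, and uri_base and config_file come from the caller):

profile = {"type": "cdl"}
create = fetchers.get(profile["type"])
if create is None:
    raise ValueError("No fetcher registered for type %r" % profile["type"])

# Each factory imports its fetcher module lazily, so only the selected
# fetcher's dependencies are loaded.
fetcher = create(profile, uri_base, config_file)  # a CDLFetcher here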
17 changes: 16 additions & 1 deletion lib/create_mapper.py
@@ -20,6 +20,18 @@ def _create_mdl_mapper(data):
         from dplaingestion.mappers.mdl_mapper import MDLMapper
         return MDLMapper(data)
 
+    def _create_cdl_json_mapper(data):
+        from dplaingestion.mappers.cdl_json_mapper import CDLJSONMapper
+        return CDLJSONMapper(data)
+
+    def _create_mapv3_json_mapper(data):
+        from dplaingestion.mappers.mapv3_json_mapper import MAPV3JSONMapper
+        return MAPV3JSONMapper(data)
+
+    def _create_mdl_json_mapper(data):
+        from dplaingestion.mappers.mdl_json_mapper import MDLJSONMapper
+        return MDLJSONMapper(data)
+
     def _create_gpo_mapper(data):
         from dplaingestion.mappers.gpo_mapper import GPOMapper
         return GPOMapper(data)
@@ -99,7 +111,10 @@ def _create_missouri_mapper(data):
         'digitalnc': lambda d: _create_digitalnc_mapper(d),
         'uiuc_marc': lambda d: _create_uiuc_marc_mapper(d),
         'dublin_core': lambda d: _create_dublin_core_mapper(d),
-        'missouri': lambda d: _create_missouri_mapper(d)
+        'missouri': lambda d: _create_missouri_mapper(d),
+        'mapv3_json': lambda d: _create_mapv3_json_mapper(d),
+        'mdl_json': lambda d: _create_mdl_json_mapper(d),
+        'cdl_json': lambda d: _create_cdl_json_mapper(d)
     }
 
     return mappers.get(mapper_type)(data)
187 changes: 187 additions & 0 deletions lib/fetchers/cdl_fetcher.py
@@ -0,0 +1,187 @@
import json
from dplaingestion.fetchers.fetcher import *

class CDLFetcher(Fetcher):
    def __init__(self, profile, uri_base, config_file):
        super(CDLFetcher, self).__init__(profile, uri_base, config_file)
        self.total_records = None
        self.endpoint_url_params = profile.get("endpoint_url_params")

    def extract_content(self, content, url):
        error = None
        try:
            parsed_content = json.loads(content)
        except Exception, e:
            error = "Error parsing content from URL %s: %s" % (url, e)
            return error, content

        if not self.total_records:
            total_records_prop = "total_rows"
            self.total_records = getprop(parsed_content, total_records_prop)

        if parsed_content is None:
            error = "Error, there is no content from " \
                    "URL %s" % url
        return error, parsed_content

    def cdl_extract_records(self, content):
        error = None
        if not self.total_records:
            total_records_prop = "total_rows"
            self.total_records = getprop(content, total_records_prop)
        records = getprop(content, "rows")

        if records:
            records = iterify(records)
            for record in records:
                record["_id"] = getprop(record, "id")
                self.get_collection_for_record(record)
        else:
            records = []
            if not error:
                error = "No records found in CDL content: %s" % content

        return error, records

    def request_records(self, content, set_id=None):
        error, records = self.cdl_extract_records(content)
        if error:
            error = "Error at index %s: %s" % \
                    (self.endpoint_url_params["skip"], error)

            # Go on to the next start point
            bulk_size = self.endpoint_url_params["limit"]
            self.endpoint_url_params["skip"] += bulk_size
        else:
            self.endpoint_url_params["skip"] += len(records)
            print "Fetched %s of %s" % (self.endpoint_url_params["skip"],
                                        self.total_records)
        request_more = (int(self.total_records) >
                        int(self.endpoint_url_params["skip"]))

        yield error, records, request_more

    def fetch_all_data(self, set):
        """A generator to yield batches of records fetched, and any errors
        encountered in the process, via the self.response dictionary.
        """

        request_more = True
        while request_more:

            error, content = self.request_content_from(
                self.endpoint_url, self.endpoint_url_params
            )
            print "Requesting %s?%s" % (self.endpoint_url,
                                        urlencode(self.endpoint_url_params,
                                                  True))

            if error is not None:
                # Stop requesting from this set
                request_more = False
                self.response["errors"].append(error)
                break

            error, content = self.extract_content(content,
                                                  self.endpoint_url)
            if error is not None:
                request_more = False
                self.response["errors"].extend(iterify(error))
            else:
                for error, records, request_more in \
                        self.request_records(content):
                    if error is not None:
                        self.response["errors"].extend(iterify(error))
                    self.add_provider_to_item_records(records)
                    self.add_collection_to_item_records(records)
                    self.response["records"].extend(records)
                    if len(self.response["records"]) >= self.batch_size:
                        yield self.response
                        self.reset_response()

        # Last yield
        self.add_collection_records_to_response()
        if self.response["errors"] or self.response["records"]:
            yield self.response
            self.reset_response()

    def add_collection_records_to_response(self):
        # Create records of ingestType "collection"
        if self.collections:
            self.response["records"].extend(self.collections.values())

    def get_collection_for_record(self, record):
        collections = getprop(record, "doc/sourceResource/collection")
        data_provider = getprop(record, "doc/dataProvider")
        if collections:
            out_collections = []
            for coll in filter(None, iterify(collections)):
                coll_title = getprop(coll, "title")

                if coll_title:
                    for title in filter(None, iterify(coll_title)):
                        if title not in self.collections:
                            self.add_to_collections(coll, data_provider)
                        out_collections.append(self.collections[title])
                    if len(out_collections) == 1:
                        return out_collections[0]
                    else:
                        return out_collections
                else:
                    return None
        else:
            return None

    def add_to_collections(self, coll, data_provider=None):
        def _normalize(value):
            """Replaces spaces with double underscores"""
            return value.replace(" ", "__")

        if not isinstance(data_provider, basestring):
            data_provider = self.contributor["name"]

        couch_id_str = "%s--%s" % (data_provider, coll["title"])
        _id = _normalize(
            couch_id_builder(self.provider, couch_id_str)
        )
        id = hashlib.md5(_id.encode("utf-8")).hexdigest()
        at_id = "http://dp.la/api/collections/" + id

        coll_to_update = self._clean_collection(coll.copy())
        coll_to_update.update({
            "_id": _id,
            "id": id,
            "@id": at_id,
            "ingestType": "collection"
        })

        desc = coll_to_update.get("description")
        if desc is not None and len(desc) == 0:
            coll_to_update.pop("description", None)
        self.collections[coll_to_update["title"]] = coll_to_update

    def add_collection_to_item_records(self, records):
        for record in records:
            collection = self.get_collection_for_record(record)
            if collection:
                record["collection"] = self._clean_collection(collection,
                                                              True)
            self._clean_collection(record)

    def _clean_collection(self, collection, include_ids=False):
        include = ['description', 'title']
        if include_ids:
            include.extend(['id', '@id'])
        if isinstance(collection, list):
            clean_collection = []
            for coll in collection:
                clean_collection.append(
                    {k: v for k, v in coll.items() if k in include}
                )
        else:
            clean_collection = {k: v for k, v in collection.items() if
                                k in include}
        return clean_collection