Commit
closes #17
* the imprint service was already in use, but not its German version
* linting
csae8092 committed Dec 15, 2023
1 parent f8eba16 commit a9bb9f4
Showing 10 changed files with 193 additions and 197 deletions.
24 changes: 14 additions & 10 deletions add_akon_urls.py
@@ -11,37 +11,41 @@
print("loading akon data into dataframe")
POSTKARTEN_DUMP = "https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.zip?inline=false"

print(f'downloading AKON Data from {POSTKARTEN_DUMP}')
print(f"downloading AKON Data from {POSTKARTEN_DUMP}")
r = urllib2.urlopen(POSTKARTEN_DUMP).read()
file = ZipFile(BytesIO(r))
cards_csv = file.open("akon_postcards_public_domain.csv")
cards_df = pd.read_csv(cards_csv, low_memory=False)
cards_df = cards_df[['geoname_id','download_link']].dropna().astype('str')
cards_df['geonames'] = cards_df['geoname_id'].apply(lambda x: "https://sws.geonames.org/{}/".format(str(x).replace('.0', '')))
cards_df = cards_df.drop_duplicates(subset='geoname_id', keep="first")
cards_df = cards_df[["geoname_id", "download_link"]].dropna().astype("str")
cards_df["geonames"] = cards_df["geoname_id"].apply(
lambda x: "https://sws.geonames.org/{}/".format(str(x).replace(".0", ""))
)
cards_df = cards_df.drop_duplicates(subset="geoname_id", keep="first")
# cards_df.to_csv('akon.csv', index=False)

doc = TeiReader(MASTER_FILE)
df = cards_df

print(f"remove any akon links from {MASTER_FILE}")
for bad in doc.any_xpath('.//tei:link[@target]'):
for bad in doc.any_xpath(".//tei:link[@target]"):
bad.getparent().remove(bad)

places = doc.any_xpath('.//tei:place')
places = doc.any_xpath(".//tei:place")

print(f"and now add akon links to {MASTER_FILE}")
for x in tqdm(places, total=len(places)):
try:
geonames = x.xpath('./tei:idno[@type="geonames"]/text()', namespaces=NAME_SPACES)[0]
geonames = x.xpath(
'./tei:idno[@type="geonames"]/text()', namespaces=NAME_SPACES
)[0]
except IndexError:
continue
try:
akon = df.loc[df['geonames'] == geonames]["download_link"].values[0]
akon = df.loc[df["geonames"] == geonames]["download_link"].values[0]
except IndexError:
continue
link_node = ET.Element("{http://www.tei-c.org/ns/1.0}link")
link_node.attrib['target'] = akon
link_node.attrib["target"] = akon
x.append(link_node)
doc.tree_to_file(MASTER_FILE)
print("done")
print("done")
16 changes: 8 additions & 8 deletions build_website.py
@@ -5,7 +5,7 @@
import requests

print("fetch imprint")
r = requests.get("https://imprint.acdh.oeaw.ac.at/20818/")
r = requests.get("https://imprint.acdh.oeaw.ac.at/20818?locale=de")
imprint_text = r.text
imprint = """{% extends "templates/partials/base.j2" %}
{% block content %}
@@ -22,21 +22,21 @@
print("Hello, let's start building")

# Clear the existing HTML files in the "./html" folder
for x in glob.glob('./html/*.html'):
for x in glob.glob("./html/*.html"):
os.unlink(x)

# Check if there is an HTML file in ./templates/static
static_html_files = glob.glob('./templates/static/*.html')
static_html_files = glob.glob("./templates/static/*.html")
if static_html_files:
for html_file in static_html_files:
shutil.copy(html_file, f'./{os.path.basename(html_file)}')
print(f'Copied {html_file} to {os.path.basename(html_file)}')
shutil.copy(html_file, f"./{os.path.basename(html_file)}")
print(f"Copied {html_file} to {os.path.basename(html_file)}")


files = glob.glob('./templates/static/*.j2')
print('Building static content')
files = glob.glob("./templates/static/*.j2")
print("Building static content")
for x in files:
template = templateEnv.get_template(x)
_, tail = os.path.split(x)
with open(f'./html/{tail.replace(".j2", ".html")}', 'w') as f:
with open(f'./html/{tail.replace(".j2", ".html")}', "w") as f:
f.write(template.render({"objects": {}}))
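The only functional change in build_website.py is the locale=de query parameter, which switches the fetched imprint to its German version. The same request can be expressed with requests' params argument; a minimal sketch, with a timeout and status check added here for robustness (they are not part of the commit):

import requests

# Fetch the German imprint text; service id 20818 is taken from build_website.py.
r = requests.get(
    "https://imprint.acdh.oeaw.ac.at/20818",
    params={"locale": "de"},
    timeout=30,
)
r.raise_for_status()
imprint_text = r.text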
61 changes: 39 additions & 22 deletions check_against_pmb.py
@@ -11,67 +11,82 @@
PMB_LISTPLACE = "https://raw.githubusercontent.com/arthur-schnitzler/schnitzler-entities/main/indices/listplace.xml"
POSTKARTEN_DUMP = "https://labs.onb.ac.at/gitlab/labs-team/raw-metadata/raw/master/akon_postcards_public_domain.zip?inline=false"

print(f'downloading AKON Data from {POSTKARTEN_DUMP}')
print(f"downloading AKON Data from {POSTKARTEN_DUMP}")
r = urllib2.urlopen(POSTKARTEN_DUMP).read()
file = ZipFile(BytesIO(r))
cards_csv = file.open("akon_postcards_public_domain.csv")
cards_df = pd.read_csv(cards_csv, low_memory=False)
cards_df = cards_df[['geoname_id','download_link']].dropna().astype('str')
cards_df['geonames'] = cards_df['geoname_id'].apply(lambda x: "https://sws.geonames.org/{}/".format(str(x).replace('.0', '')))
cards_df = cards_df.drop_duplicates(subset='geoname_id', keep="first")
cards_df = cards_df[["geoname_id", "download_link"]].dropna().astype("str")
cards_df["geonames"] = cards_df["geoname_id"].apply(
lambda x: "https://sws.geonames.org/{}/".format(str(x).replace(".0", ""))
)
cards_df = cards_df.drop_duplicates(subset="geoname_id", keep="first")


def fix_geonames(x):
try:
return get_normalized_uri(x)
except TypeError:
return x


print(f"loading PMB Places from {PMB_LISTPLACE}")
doc = TeiReader(PMB_LISTPLACE)
d = dict()
places = doc.any_xpath('.//tei:place')
places = doc.any_xpath(".//tei:place")
for x in tqdm(places, total=len(places)):
xml_id = x.attrib['{http://www.w3.org/XML/1998/namespace}id']
urls = [x.text for x in x.xpath('.//tei:idno[@type="URL"]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) if 'geonames' in x.text]
xml_id = x.attrib["{http://www.w3.org/XML/1998/namespace}id"]
urls = [
x.text
for x in x.xpath(
'.//tei:idno[@type="URL"]',
namespaces={"tei": "http://www.tei-c.org/ns/1.0"},
)
if "geonames" in x.text
]
if urls:
for u in urls:
d[get_normalized_uri(u)] = xml_id

pmb_list = []
for key, value in d.items():
item = {}
item['geonames'] = key
item["geonames"] = key
url_pattern = "https://pmb.acdh.oeaw.ac.at/apis/entities/entity/place/{}/detail"
item['pmb_id'] = url_pattern.format(value.replace('place__', ''))
item["pmb_id"] = url_pattern.format(value.replace("place__", ""))
pmb_list.append(item)
pmb_df = pd.DataFrame(pmb_list)

df = pd.read_csv(SCHNITZLER_ORTE_CSV)
df['geonames'] = df['desc/placeName/_ref'].apply(lambda x: fix_geonames(x))
with_pmb = pd.merge(df, pmb_df, on=['geonames'], how='left')
with_pmb.to_csv('finalized-files/places_with_pmb.csv')

with_akon = pd.merge(with_pmb, cards_df, on=['geonames'], how='left')
with_akon.to_csv('finalized-files/places_with_pmb_and_akon.csv')
df["geonames"] = df["desc/placeName/_ref"].apply(lambda x: fix_geonames(x))
with_pmb = pd.merge(df, pmb_df, on=["geonames"], how="left")
with_pmb.to_csv("finalized-files/places_with_pmb.csv")

with_akon = pd.merge(with_pmb, cards_df, on=["geonames"], how="left")
with_akon.to_csv("finalized-files/places_with_pmb_and_akon.csv")


all_uris = "./finalized-files/places_with_pmb_and_akon.csv"
main_file = "./finalized-files/transformed-xml/19-strukturiert-tagesgenau.xml"

print(f"write pmb/akon ids into {main_file} using {all_uris} as input")
print(f"write pmb/akon ids into {main_file} using {all_uris} as input")

df_dict = pd.read_csv(all_uris).drop_duplicates(subset='geonames', keep="first").set_index('geonames').to_dict('index')
df_dict = (
pd.read_csv(all_uris)
.drop_duplicates(subset="geonames", keep="first")
.set_index("geonames")
.to_dict("index")
)
doc = TeiReader(main_file)
places = doc.any_xpath('.//tei:place')
places = doc.any_xpath(".//tei:place")
no_match = set()
for x in tqdm(places, total=len(places)):
geonames = x.xpath('.//*[@type="geonames"]')[0].text
fixed_geonames = get_normalized_uri(geonames)
x.xpath('.//*[@type="geonames"]')[0].text = fixed_geonames
try:
pmb = df_dict[fixed_geonames]['pmb_id']
akon = df_dict[fixed_geonames]['download_link']
pmb = df_dict[fixed_geonames]["pmb_id"]
akon = df_dict[fixed_geonames]["download_link"]
except KeyError:
no_match.add(
f"{geonames}__{x.xpath('.//tei:placeName/text()', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})[0]}"
@@ -80,8 +95,10 @@ def fix_geonames(x):
if isinstance(pmb, str):
x.xpath('.//*[@type="pmb"]')[0].text = pmb
if isinstance(akon, str):
x.xpath('.//tei:link[@target]', namespaces={'tei':"http://www.tei-c.org/ns/1.0"})[0].attrib['target'] = akon
x.xpath(
".//tei:link[@target]", namespaces={"tei": "http://www.tei-c.org/ns/1.0"}
)[0].attrib["target"] = akon
doc.tree_to_file(main_file)
print("no geonames-matches found for:")
for x in no_match:
print(x)
print(x)
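check_against_pmb.py hinges on two left-merges keyed on the normalised geonames URI: places are first joined to PMB ids, then to AKON download links. A toy illustration with invented placeholder values (only the column names and merge calls mirror the script):

import pandas as pd

places = pd.DataFrame({"geonames": ["https://sws.geonames.org/2761369/"]})
pmb_df = pd.DataFrame(
    {
        "geonames": ["https://sws.geonames.org/2761369/"],
        "pmb_id": ["https://pmb.acdh.oeaw.ac.at/apis/entities/entity/place/42/detail"],
    }
)
cards_df = pd.DataFrame(
    {
        "geonames": ["https://sws.geonames.org/2761369/"],
        "download_link": ["https://example.org/akon/card.jpg"],
    }
)
with_pmb = pd.merge(places, pmb_df, on=["geonames"], how="left")
with_akon = pd.merge(with_pmb, cards_df, on=["geonames"], how="left")
# Places without a PMB or AKON match keep NaN in those columns, which is why
# the script later guards writes with isinstance(value, str).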
13 changes: 4 additions & 9 deletions config.py
@@ -7,13 +7,8 @@
}

# places with these names should be ignored
FILTER_WORDS = [
"hotel",
"restaurant",
"gasthaus",
"straße",
"gasse",
"allee"
]
FILTER_WORDS = ["hotel", "restaurant", "gasthaus", "straße", "gasse", "allee"]

PMB_LISTPLACE_DUMP = "https://oeawcloud.oeaw.ac.at/index.php/s/Kbnien5KfnPaFsK/download/listplace.xml"
PMB_LISTPLACE_DUMP = (
"https://oeawcloud.oeaw.ac.at/index.php/s/Kbnien5KfnPaFsK/download/listplace.xml"
)
18 changes: 9 additions & 9 deletions enrich_from_pmb.py
@@ -8,15 +8,15 @@
doc.tree_to_file("hansi.xml")

data = {}
for x in doc.any_xpath('.//tei:place[@xml:id]'):
for x in doc.any_xpath(".//tei:place[@xml:id]"):
for y in x.xpath('./tei:idno[@subtype="pmb"]/text()', namespaces=NAME_SPACES):
if y.endswith('/'):
if y.endswith("/"):
data[y] = {}
else:
data[f"{y}/"] = {}
for idno in x.xpath('./tei:idno', namespaces=NAME_SPACES):
for idno in x.xpath("./tei:idno", namespaces=NAME_SPACES):
try:
domain = idno.attrib['subtype']
domain = idno.attrib["subtype"]
except KeyError:
print(f"no idno type subtype for {y}")
continue
@@ -26,7 +26,7 @@
elif domain == "geonames":
continue
else:
if y.endswith('/'):
if y.endswith("/"):
data[y][domain] = uri
else:
data[f"{y}/"][domain] = uri
@@ -41,10 +41,10 @@
pmb = x.xpath('./tei:idno[@type="pmb"]/text()', namespaces=NAME_SPACES)[0]
match = data[pmb]
for key, value in match.items():
idno = ET.Element('{http://www.tei-c.org/ns/1.0}idno')
idno.attrib['type'] = "website"
idno.attrib['subtype'] = key.replace('-', '_')
idno = ET.Element("{http://www.tei-c.org/ns/1.0}idno")
idno.attrib["type"] = "website"
idno.attrib["subtype"] = key.replace("-", "_")
idno.text = value
x.append(idno)
doc.tree_to_file(MASTER_ENRICHED)
print("done")
print("done")
