fix dump script
ivanhb committed Feb 9, 2024
1 parent cbf4f4b commit e46cef1
Showing 2 changed files with 134 additions and 0 deletions.
130 changes: 130 additions & 0 deletions scripts/fix_dump.py
@@ -0,0 +1,130 @@
#!python
# Copyright (c) 2022 The OpenCitations Index Authors.
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

import multiprocessing
import os
import time
import csv
import redis
from zipfile import ZipFile
import json
import io
import re

from tqdm import tqdm
from argparse import ArgumentParser
from urllib.parse import quote
from datetime import datetime, timezone
from collections import defaultdict

# NOTE: get_config is assumed to be provided by the OpenCitations Index utilities
# (the same configuration helper used by the other scripts in this repository)
from oc.index.utils.config import get_config

_config = get_config()

def create_dirs(path_to_create):
    # Split the path into individual directories and create each missing level
    dirs = path_to_create.split(os.path.sep)
    current_path = ""
    for directory in dirs:
        current_path = os.path.join(current_path, directory)
        if current_path != "" and not os.path.exists(current_path):
            os.mkdir(current_path)

def fix_dump(input_dir, output_dir, f_rm_cits, f_add_cits):

    # OCIs of the citations to be removed from the dump
    oci_rm_set = set()
    with open(f_rm_cits, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        for row in reader:
            oci_rm_set.add(row[0])

    # OCIs of the citations to be added (collected here, not yet used below)
    oci_add_set = set()
    with open(f_add_cits, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)
        for row in reader:
            oci_add_set.add(row[0])

    files_to_process = []
    for root, dirs, files in os.walk(input_dir):

        # create the destination directory of the file if it does not exist
        out_f_dest = output_dir + "fix_dump/" + root.replace(input_dir, "")
        create_dirs(out_f_dest)

        for file in files:
            if file.endswith('.ttl'):

                # process the input file: keep only the citation lines
                # whose OCI is not flagged for removal
                new_file_lines = []
                with open(os.path.join(root, file), 'r') as ttl_file:
                    lines = ttl_file.readlines()
                    for line in lines:
                        if line.strip() != "":
                            oci_pattern = r"https://w3id.org/oc/index/ci/(\d{1,}-\d{1,})"
                            oci = re.search(oci_pattern, line)
                            if oci:
                                oci = oci.group(1)
                                if oci not in oci_rm_set:
                                    new_file_lines.append(line)

                # produce the corresponding new file
                with open(os.path.join(out_f_dest, file), 'w', newline='') as new_file:
                    for line in new_file_lines:
                        new_file.write(line)

def main():
    global _config

    arg_parser = ArgumentParser(description="Fix the RDF (TTL) dump of the OpenCitations Index")
    arg_parser.add_argument(
        "-i",
        "--input",
        required=True,
        help="The input directory containing the original files in TTL",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        default="out",
        help="The destination directory to save outputs",
    )
    arg_parser.add_argument(
        "-rmc",
        "--rmcits",
        required=True,
        default=None,
        help="Remove citations from the RDF (TTL) dump of the Index; it needs a CSV file in which each row contains an OCI to be removed from the dump",
    )
    arg_parser.add_argument(
        "-addc",
        "--addcits",
        required=True,
        default=None,
        help="Add citations to the RDF (TTL) dump of the Index; it needs a CSV file in which each row contains an OCI to be added as a new citation in the Index dump",
    )

    args = arg_parser.parse_args()

    # input directory/file
    input_dir = args.input + "/" if args.input[-1] != "/" else args.input

    # output directory
    output_dir = args.output + "/" if args.output[-1] != "/" else args.output
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # call the fix_dump function
    fix_dump(input_dir, output_dir, args.rmcits, args.addcits)

    print("Done !!")


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions scripts/norm.py
@@ -100,6 +100,8 @@ def normalize_dump(input_dir, output_dir, mapping_file):
duplicated_omid = duplicated_omid.split("https://w3id.org/oc/meta/br/")[1]
omid_mapper[duplicated_omid] = correct_omid

# omid_mapper is a dictionary that maps each duplicated OMID to its corresponding **correct OMID**

files_to_process = []
for root, dirs, files in os.walk(input_dir):
for file in files:
@@ -132,6 +134,8 @@ def normalize_dump(input_dir, output_dir, mapping_file):

#check if the citation has been modified
if new_oci != oci:
# then the original OCI must be invalidated
# and a corresponding new citation *new_oci* (built from the corrected OMIDs) must be considered instead
invalidated_cits.append(oci)
valid_cits.append(new_oci)

