-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheckPuncuationOfStrings.py
75 lines (63 loc) · 2.1 KB
/
checkPuncuationOfStrings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import csv
import argparse
import re
from datetime import datetime
import pandas as pd
# Command-line options; any value not supplied is asked for interactively.
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file', help='enter filename with csv.')
parser.add_argument('-b', '--batch', help='Batch letter to name outputs.')
args = parser.parse_args()

# A missing (None) or empty option is falsy, so `or` falls through to the
# prompt; the filename is still asked for before the batch letter.
filename = args.file or input('Enter filename (including \'.csv\'): ')
batch = args.batch or input('Enter batch letter: ')
def findError(search, value):
    """Count the non-overlapping matches of a regex in a string.

    Args:
        search: Regular-expression pattern to look for.
        value: Text to scan.

    Returns:
        The number of matches ``re.findall`` reports for ``search`` in
        ``value``.
    """
    # Scan the ``value`` argument itself rather than a module-level
    # variable, so the count always reflects what the caller passed in.
    return len(re.findall(search, value))
def findCategory(errorCount, threshold, type):
    """Set the current row's category once a count reaches its threshold.

    Stores ``type`` under ``'category'`` in the module-level ``newDict``
    when ``errorCount`` is at least ``threshold``; otherwise ``newDict`` is
    left untouched. Returns ``None`` either way.
    """
    if errorCount < threshold:
        return
    newDict['category'] = type
def manuallyCheck(errorCount, threshold):
    """Mark the current row for checking once a count reaches its threshold.

    Sets ``'check'`` to ``'yes'`` in the module-level ``newDict`` when
    ``errorCount`` is at least ``threshold``; otherwise ``newDict`` is left
    untouched. Returns ``None`` either way.
    """
    if errorCount < threshold:
        return
    newDict['check'] = 'yes'
# Read the input CSV and classify each row by the punctuation found in its
# 'newValue' column. One output dict per input row is collected in all_items.
all_items = []
# newline='' is what the csv module asks for: without it, line breaks
# embedded in quoted fields are translated before the reader sees them,
# and this script counts exactly those line breaks.
with open(filename, newline='') as itemMetadataFile:
    itemMetadata = csv.DictReader(itemMetadataFile)
    for row in itemMetadata:
        uri = row['uri']
        oldValue = row['dc.subject']
        newValue = row['newValue']
        category = row['category']
        # newDict must stay a module-level name: findCategory() and
        # manuallyCheck() update it in place. Key order here is the column
        # order of the output.
        newDict = {
            'uri': uri,
            'dc.subject': oldValue,
            'newValue': newValue,
            'check': 'no',
            'category': category,
        }
        matchCommas = findError(r',', newValue)
        matchNewLines = findError(r'(\n|\r\n)', newValue)
        matchPeriods = findError(r'\.', newValue)
        matchColons = findError(r':', newValue)
        # Later calls overwrite earlier ones, so a single colon ('bad')
        # takes precedence over 'list'.
        findCategory(matchCommas, 2, 'list')
        findCategory(matchNewLines, 2, 'list')
        findCategory(matchColons, 1, 'bad')
        manuallyCheck(matchPeriods, 1)
        # Exactly one comma is below the 'list' threshold, so the row is
        # marked check='yes' instead.
        if matchCommas == 1:
            newDict['check'] = 'yes'
        all_items.append(newDict)
# Summarise the classified rows and write them to a timestamped CSV.
# Naming the columns explicitly keeps the original column order and also
# gives an empty input a well-formed (header-only) frame, so the
# df['check'] / df['category'] lookups below cannot fail on zero rows.
df = pd.DataFrame(
    all_items,
    columns=['uri', 'dc.subject', 'newValue', 'check', 'category'],
)
check_counts = df['check'].value_counts(dropna=False)
cat_counts = df['category'].value_counts(dropna=False)
print(check_counts)
print(cat_counts)

# Dots instead of colons in the time part keep the timestamp usable in a
# file name.
dt = datetime.now().strftime('%Y-%m-%d %H.%M.%S')
newFile = '02_deDuplicatedSubjects_Batch{}_{}.csv'.format(batch, dt)
df.to_csv(path_or_buf=newFile, index=False)