-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEstraiPaperPerPaper.py
102 lines (89 loc) · 3.88 KB
/
EstraiPaperPerPaper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#! python2
import os
import multiprocessing as mp
from timeit import default_timer as timer
import traceback
import re
# from Updater import Upd
def processaPAApaper(pfPapAutAffRAW, chunkStart, chunkSize, sIDpap):
    """Extract from one chunk of the raw file the records whose paper ID is wanted.

    Designed to run in a worker process: it opens the file itself and reads
    only the byte range it has been assigned.

    Args:
        pfPapAutAffRAW: path of the raw tab-separated file; each record is
            IDpap, IDaut and (not always present) IDaff.
        chunkStart: byte offset at which the chunk begins.
        chunkSize: maximum number of bytes to read from ``chunkStart``.
        sIDpap: set of paper IDs to keep. The file is read in binary mode, so
            the IDs are compared as raw byte strings.

    Returns:
        The matching records concatenated into one byte string, each stripped
        of trailing whitespace and terminated by ``\\r\\n``. Empty when nothing
        in the chunk matches.

    Raises:
        Any exception raised while reading or parsing; the traceback is printed
        first and the exception is then re-raised.
    """
    try:
        # Load only the lines belonging to this chunk.
        with open(pfPapAutAffRAW, 'rb') as fPapAutAffRAW:
            fPapAutAffRAW.seek(chunkStart)
            lines = fPapAutAffRAW.read(chunkSize).splitlines()

        # Collect matches in a list and join once: repeated string
        # concatenation is quadratic on large chunks.
        matched = []
        for line in lines:
            pezzi = line.split(b'\t')  # IDpap-IDaut-IDaff; IDaff may be missing
            if len(pezzi) < 2:
                # Malformed record: report it (the message used to be built
                # and then discarded, so these lines vanished silently).
                print('errore alla linea {}'.format(line))
                continue
            if pezzi[0] in sIDpap:
                matched.append(line.rstrip() + b'\r\n')
        return b''.join(matched)
    except Exception:
        # Print the traceback here, inside the worker, so the failing location
        # is visible before the exception travels back to the parent process.
        traceback.print_exc()
        raise
def chunkMyFile(fpath, roughSize):
    """Split a file into consecutive byte ranges that end on line boundaries.

    Generator: each chunk starts where the previous one ended, spans at least
    ``roughSize`` bytes and is then extended to the end of the line it landed
    in, so no record is ever cut in half. The chunks cover the file exactly:
    the last one is clamped to the file size and no empty chunk is produced
    (an empty file yields nothing).

    Args:
        fpath: path of the file to split.
        roughSize: approximate chunk size in bytes; must not be negative.

    Yields:
        ``(chunkStart, chunkSize)`` tuples of byte offsets/lengths.

    Raises:
        ValueError: if ``roughSize`` is negative.
    """
    if roughSize < 0:
        raise ValueError('roughSize must be >= 0, got {}'.format(roughSize))

    fileEnd = os.path.getsize(fpath)
    with open(fpath, 'rb') as f:
        chunkStart = 0
        while chunkStart < fileEnd:
            f.seek(chunkStart + roughSize)
            f.readline()  # extend the chunk to the end of the current line
            # Seeking past EOF makes tell() report a position beyond the file,
            # so clamp to keep the reported size truthful.
            chunkEnd = min(f.tell(), fileEnd)
            yield chunkStart, chunkEnd - chunkStart
            chunkStart = chunkEnd
def estraiPapAutAffPerPaper(pfPaperID, pfPapAutAffRAW, pfPapAutAff):
    """Filter the raw paper-author-affiliation file by a set of paper IDs.

    The first tab-separated field of every line of ``pfPaperID`` is loaded
    into a set. ``pfPapAutAffRAW`` (records IDpap-IDaut-IDaff) is then split
    into chunks that are scanned in parallel by a process pool, and every
    record whose IDpap is in the set is written to ``pfPapAutAff``. Results
    are written in chunk order, so the output keeps the order of the raw file.

    Args:
        pfPaperID: path of the file whose first column holds the wanted IDs.
        pfPapAutAffRAW: path of the raw file to filter.
        pfPapAutAff: path of the output file (overwritten).

    Raises:
        Whatever a worker raised while processing its chunk, or any I/O error
        from reading the inputs / writing the output. The pool is terminated
        before the exception propagates.
    """
    sIDpap = set()
    with open(pfPaperID, 'rb') as fPaperID:
        for line in fPaperID:
            # Drop the line ending first: on a line without a tab the first
            # field would otherwise keep its newline and never match.
            idPap = line.rstrip(b'\r\n').split(b'\t', 1)[0]
            if idPap:  # ignore blank lines / empty IDs
                sIDpap.add(idPap)

    roughSize = 1024 * 1024 * 128
    sizePAAraw = os.path.getsize(pfPapAutAffRAW)
    print('sizePaperAuthorAffiliationRAW: {} chunks: {} roughSize: {}'.format(
        sizePAAraw, sizePAAraw // roughSize, roughSize))

    pool = mp.Pool(mp.cpu_count())
    try:
        # Queue every chunk up front, then collect in submission order.
        lresult = [
            pool.apply_async(
                processaPAApaper,
                (pfPapAutAffRAW, chunkStart, chunkSize, sIDpap))
            for chunkStart, chunkSize in chunkMyFile(pfPapAutAffRAW, roughSize)
        ]
        with open(pfPapAutAff, 'wb') as fPapAutAff:
            for r in lresult:
                fPapAutAff.write(r.get())
    except BaseException:
        # Do not leave workers grinding through queued chunks after a failure.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Wait for the worker processes to exit so none is left behind.
        pool.join()
if __name__ == '__main__':
    # Script entry point: filter the full raw paper-author-affiliation file
    # and report the elapsed time. On platforms where child processes
    # re-import this module, the guard keeps the workers from re-running it.
    print('This program is EstraiPaperPerPaper, being run by itself')

    # PATH TO FILES
    # Built with os.path.join instead of hard-coded backslashes; on Windows
    # the resulting paths are identical to the previous literals.
    celaborati = 'Versione3_Upd'
    cRAW = os.path.join('..', 'FileRAW')

    # Earlier runs used AutoriDEI.txt and AutoriDEIMacroFull.txt as ID source.
    pfPaperID = os.path.join(celaborati, 'AutoriDEIupd.txt')

    # Smaller samples of the raw file (PaperAuthorAffiliations500.txt,
    # ...1000.txt, ...5000000.txt) can be substituted here for quick runs.
    pfPapAutAffRAW = os.path.join(cRAW, 'PaperAuthorAffiliations.txt')

    # Earlier runs wrote to PapAutAffDEImultiFull.txt.
    pfPapAutAff = os.path.join(celaborati, 'PapAutAffDEIupd.txt')

    start = timer()
    estraiPapAutAffPerPaper(pfPaperID, pfPapAutAffRAW, pfPapAutAff)
    end = timer()
    print('Completato estraiPapAutAffPerPaper in {}'.format(end - start))