-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_networks.py
161 lines (129 loc) · 5.16 KB
/
create_networks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from dataclasses import replace
from os import remove
import string
from bs4 import BeautifulSoup
from lxml import etree
def create_networks(xml_file, nodes_sheet, edges_sheet):
# Reading the data inside the xml file to a variable under the name data
with open(xml_file, "r", encoding="utf-8") as tei:
data = BeautifulSoup(tei, features='lxml')
#CLASS PERSON
#creation of a list with all the tags 'persname' of the original tei file
pers_list=[]
for paragraph in data.find_all('p'):
pers = paragraph.find_all('persname')
for x in pers:
pers_list.append(x)
#list with all the values of the attribute 'ref' of the tags 'persname'
ref_pers_list = []
for x in pers_list:
ref_pers_list.append(x.get("ref"))
#dictionary with as keys the 'ref' of the persname tags and as values the contents of the persname tags
pers_dict = dict()
for person in data.find_all('person'):
pers_ref_from_id = '#'+person.get('xml:id')
pers_label = person.persname.text
pers_dict[pers_ref_from_id]=pers_label
#list with just the contents of the persname tags
pers_label_list=[]
for ref in ref_pers_list:
if ref in pers_dict:
pers_label_list.append(pers_dict[ref])
#list of tuples with the values ('ref', 'label', 'type') of each person
tuples_nodes_pers=[]
for item in pers_list:
tuples_nodes_pers.append((item.get('ref'), pers_dict[(item.get('ref'))], "person"))
#CLASS PLACE
place_list=[]
for paragraph in data.find_all('p'):
place = paragraph.find_all('placename')
for x in place:
place_list.append(x)
ref_place_list = []
for x in place_list:
ref_place_list.append(x.get("ref"))
place_dict = dict()
for place in data.find_all('place'):
place_ref_from_id = '#'+place.get('xml:id')
place_label = place.placename.text
place_dict[place_ref_from_id]=place_label
place_label_list=[]
for ref in ref_place_list:
if ref in place_dict:
place_label_list.append(place_dict[ref])
tuples_nodes_place=[]
for item in place_list:
tuples_nodes_place.append((item.get('ref'), place_dict[(item.get('ref'))], "place"))
#CLASS PROPERTY
property_list=[]
for paragraph in data.find_all('p'):
property = paragraph.find_all('objectname')
for x in property:
property_list.append(x)
ref_prop_list = []
for x in property_list:
ref_prop_list.append(x.get("ref"))
prop_dict = dict()
for prop in data.find_all('object'):
prop_ref_from_id = '#'+prop.get('xml:id')
prop_label = prop.objectname.text
prop_dict[prop_ref_from_id]=prop_label
prop_label_list=[]
for ref in ref_prop_list:
if ref in prop_dict:
prop_label_list.append(prop_dict[ref])
tuples_nodes_prop=[]
for item in property_list:
tuples_nodes_prop.append((item.get('ref'), prop_dict[(item.get('ref'))], "property"))
#CSV - NODES_SHEET
import csv
with open(nodes_sheet, 'w', encoding='UTF-8', newline='') as file:
writer = csv.writer(file)
writer.writerow(["Id", "Label", "Type"])
for item_1 in tuples_nodes_pers:
writer.writerow(item_1)
for item_2 in tuples_nodes_place:
writer.writerow(item_2)
for item_3 in tuples_nodes_prop:
writer.writerow(item_3)
#creation of a list with all the tags 'persname', 'placename' and 'objectname' of the original tei file
#that appear in the same paragraph
p_list=[]
for p in data.find_all('p'):
persons = p.find_all('persname')
places = p.find_all('placename')
objects = p.find_all('objectname')
if persons or places or objects:
p_list.append([persons + places + objects])
#using list comprehension
#Removing empty lists from p_list, which is a list of lists
N = []
p_list = [[ele for ele in sub if ele != N] for sub in p_list]
#replacement of the tags with their corresponding ref values
for x in p_list:
for y in x:
i = 0
while i < len(y):
y[i] = y[i].get("ref")
i += 1
#creation of a list with tuples containing the ref values of all the nodes that appear in the same paragraph
tuples_edges=[]
for x in p_list:
for y in x:
if len(y) > 0:
tuples_edges.append(tuple(y))
#Create a list of tuples associating two by two the items of each tuple in tuples_edges together
tpl_lst1 = list()
for paragraph in tuples_edges:
for i in range(len(paragraph)):
tpl1 = tuple((paragraph[0], paragraph[i]))
tpl_lst1.append(tpl1)
#CSV - EDGES_SHEET
import csv
with open(edges_sheet, 'w', encoding='UTF-8', newline='') as file:
writer = csv.writer(file)
writer.writerow(["Source", "Target"])
for item in tpl_lst1:
writer.writerow(item)
#create_networks("Ignazio_Paolo.xml", "nodes_sheet.csv", "edges_sheet.csv")
#create_networks("Vincenzo.xml", "vincenzo_nodes.csv", "vincenzo_edges.csv")