-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
149 lines (129 loc) · 5.95 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#Packages
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from PIL import Image
import streamlit.components.v1 as components
from pyvis.network import Network
import warnings
# Suppress every Python warning for the rest of the process so third-party
# warnings are not emitted while the app runs.
warnings.filterwarnings('ignore')
# Switch off Streamlit's 'deprecation.showPyplotGlobalUse' option.
# NOTE(review): presumably this hides the notice Streamlit shows when
# st.pyplot() is called without an explicit figure -- confirm that the
# installed Streamlit version still accepts this option name.
st.set_option('deprecation.showPyplotGlobalUse', False)
# DOM visualizer with functions
# 1. Cleaning html
# 2. Get html source
# 3. Cleaning Soup
# 4. Dataframe Creation
# 5. Data cleaning
# 6. Visualize
class DOM_visualizer:
    """Fetch a web page and draw its DOM as a graph of tag names.

    Intended call order:
    ``get_html_source`` -> ``clean_html_soup`` -> ``dataframe_creation`` ->
    ``data_clean`` -> ``visualize`` (static image) and/or
    ``interactive_plot`` + ``visualize_pyvis_graph`` (interactive view).

    Nodes of the graph are tag *names* (``div``, ``p``, ...), not individual
    elements, so every ``parent tag -> child tag`` pair appears at most once.
    """

    # Tags that are stripped from the document before the graph is built.
    IGNORED_TAGS = ('script', 'style', 'meta', 'noscript', 'link', 'br')
    # File the static matplotlib rendering is saved to.
    STATIC_IMAGE_PATH = "DOM_Tree_viz.png"
    # File shared by interactive_plot (writer) and visualize_pyvis_graph (reader).
    INTERACTIVE_HTML_PATH = "dom_tree_inter_viz.html"

    def cleanMe(self, html):
        """Parse *html* with html5lib and remove ignored tags and comments.

        Args:
            html: Markup as ``str`` or ``bytes``.

        Returns:
            The parsed BeautifulSoup document with every tag listed in
            ``IGNORED_TAGS`` and every HTML comment extracted.
        """
        soup = bs(html, "html5lib")
        # find_all accepts a list of names, so one pass covers all ignored tags.
        for node in soup.find_all(list(self.IGNORED_TAGS)):
            node.extract()
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup

    def get_html_source(self, url, timeout=10):
        """Download *url* and return the raw response body.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up;
                without it a stalled server would block the app forever.

        Returns:
            The response body as ``bytes``.

        Raises:
            requests.exceptions.RequestException: On connection problems,
                invalid URLs or timeouts.
        """
        response = requests.get(url, timeout=timeout)
        return response.content

    def clean_html_soup(self, soup):
        """Return the cleaned, parsed document for *soup*.

        Args:
            soup: Raw markup (``bytes`` or ``str``) or any object whose
                ``str()`` is markup (e.g. an existing BeautifulSoup object).

        Returns:
            The BeautifulSoup document produced by ``cleanMe``.
        """
        # bytes must be handed to the parser untouched: str(b"<html>") is the
        # literal text "b'<html>'", which would be parsed as page content and
        # distort the resulting tree.
        markup = soup if isinstance(soup, (bytes, str)) else str(soup)
        return self.cleanMe(markup)

    def dataframe_creation(self, html):
        """Build the parent/child edge list of a parsed document.

        Args:
            html: Parsed document exposing ``find_all(True)``; every tag must
                expose ``name`` and ``children``.

        Returns:
            A DataFrame with columns ``source`` (parent tag name) and
            ``target`` (child name). Children without a name (text nodes)
            yield ``None`` in ``target``.
        """
        edges = [
            (tag.name, child.name)
            for tag in html.find_all(True)
            for child in tag.children
        ]
        return pd.DataFrame(edges, columns=['source', 'target'])

    def data_clean(self, df):
        """Drop incomplete edges and collapse repeated edges to one row.

        Args:
            df: Edge list with ``source`` and ``target`` columns.

        Returns:
            A new DataFrame; *df* itself is not modified.
        """
        # Keep the first occurrence of each edge. keep=False would delete
        # every edge that occurs more than once (e.g. ul -> li) instead of
        # de-duplicating it.
        return df.dropna().drop_duplicates()

    def visualize(self, df):
        """Render the edge list as a static tree image in the Streamlit page.

        The figure is also saved to ``STATIC_IMAGE_PATH``.

        Args:
            df: Cleaned edge list with ``source`` and ``target`` columns.
        """
        if df.empty:
            # Nothing to lay out; the colour scale below needs >= 1 node.
            st.warning("No DOM edges found to visualize.")
            return
        G = nx.from_pandas_edgelist(df, source='source', target='target', create_using=nx.DiGraph())
        node_limit = len(G.nodes)
        cmap = plt.cm.spring
        fig, ax = plt.subplots(figsize=(20, 10))
        pos = graphviz_layout(G, prog='dot')
        nx.draw(G, pos, ax=ax, with_labels=True, node_size=5000, font_size=20,
                font_color='black', node_color=list(range(node_limit)), cmap=cmap)
        # Colour bar spanning the node indices used as colours above.
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=0, vmax=node_limit - 1))
        sm.set_array([])
        fig.colorbar(sm, ax=ax)
        ax.set_title('DOM Tree Visualization')
        fig.savefig(self.STATIC_IMAGE_PATH, dpi=300)
        st.pyplot(fig)
        # Release the figure; Streamlit reruns would otherwise pile them up.
        plt.close(fig)

    def interactive_plot(self, df):
        """Write an interactive pyvis rendering of the edge list to disk.

        The HTML is saved to ``INTERACTIVE_HTML_PATH``; call
        ``visualize_pyvis_graph`` afterwards to embed it in the page.

        Args:
            df: Cleaned edge list with ``source`` and ``target`` columns.
        """
        G = nx.from_pandas_edgelist(df, source='source', target='target', create_using=nx.DiGraph())
        nt = Network("500px", "500px")
        nt.from_nx(G)
        nt.toggle_physics(True)
        nt.save_graph(self.INTERACTIVE_HTML_PATH)

    def visualize_pyvis_graph(self):
        """Embed the HTML written by ``interactive_plot`` in the Streamlit page."""
        with open(self.INTERACTIVE_HTML_PATH, 'r', encoding='utf-8') as html_file:
            source_code = html_file.read()
        components.html(source_code, width=1000, height=500)
# ---- Main page: header, URL input and the visualisation pipeline ----
dom = DOM_visualizer()
image = Image.open('images/DOM_Tree_viz.png')
st.image(image, caption='Document Object Model Tree Visualization', width=150)
st.title("DOM Tree Visualization")
url = st.text_input("Enter Url:")
inter_plot = st.checkbox("interactive plot")
visualize = st.button("Visualize DOM")

# Run the pipeline once per click; both the static and the interactive view
# are drawn from the same cleaned edge list, so the page is fetched and
# parsed a single time.
if visualize and url.strip():
    try:
        html_source = dom.get_html_source(url.strip())
    except requests.exceptions.RequestException as err:
        # Invalid URL, unreachable host, timeout, ...: report it in the page
        # rather than aborting the script with a traceback.
        st.error(f"Could not fetch {url!r}: {err}")
    else:
        clean_html = dom.clean_html_soup(html_source)
        html_dataframe = dom.dataframe_creation(clean_html)
        clean_html_dataframe = dom.data_clean(html_dataframe)
        dom.visualize(clean_html_dataframe)
        if inter_plot:
            dom.interactive_plot(clean_html_dataframe)
            dom.visualize_pyvis_graph()
# ---- Sidebar: short reference notes about the DOM ----
Topics = pd.DataFrame({'topics': ['Select topic', 'What is DOM', 'Types of DOM', 'HTML DOM']})

# HTML snippet rendered for each selectable topic; the placeholder entry
# 'Select topic' deliberately has no snippet.
_TOPIC_HTML = {
    'What is DOM': "<ul><li>The DOM is a W3C (World Wide Web Consortium) standard.</li><li>The DOM defines a standard for accessing documents</li><li>The W3C Document Object Model (DOM) is a platform and language-neutral interface that allows programs and scripts to dynamically access and update the content, structure, and style of a document.</li></ul>",
    'Types of DOM': "<ul><li>Core DOM - standard model for all document types</li><li>XML DOM - standard model for XML documents</li><li>HTML DOM - standard model for HTML documents</li></ul>",
    'HTML DOM': "<p>The HTML DOM is a standard <strong>object</strong> model and <strong>programming interface</strong> for HTML. It defines:</p><ul><li>The HTML elements as <b>objects</b></li><li>The <b>properties</b> of all HTML elements</li><li>The <b>methods</b> to access all HTML elements</li><li>The <b>events</b> for all HTML elements</li></ul><p>In other words: The HTML DOM is a standard for <b>how to get, change, add, or delete HTML elements.</b></p>",
}

option = st.sidebar.selectbox('Select the Topic', Topics['topics'])
# The placeholder entry means "nothing chosen".
if option == 'Select topic':
    option = ''

topic_html = _TOPIC_HTML.get(option)
if topic_html is not None:
    st.write(option)
    components.html(topic_html, width=700, height=500)