-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaner.py
65 lines (57 loc) · 2.16 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Put the logcat files (ending in .txt) in the directory you run this script from.
# The script removes empty and duplicate .txt files, then writes a cleaned copy
# of each remaining file as cleaned_<original file name>.
import os
import re
import md5
import sys
def remove_empties(dir, subname):
    """Delete every zero-byte file under ``dir`` whose name ends with ``subname``.

    The tree rooted at ``dir`` is walked recursively with ``os.walk``, so
    matching empty files in sub-directories are removed as well.  One line is
    printed for each file that is deleted.

    Args:
        dir: Root directory to scan.
        subname: File-name suffix to match (for example ``".txt"``).
    """
    for dirpath, _dirnames, filenames in os.walk(dir):
        for name in filenames:
            if not name.endswith(subname):
                continue
            path = os.path.join(dirpath, name)
            if os.path.getsize(path) == 0:
                os.remove(path)
                # Single-argument call form prints the same text on
                # Python 2 and Python 3.
                print("Empty File: " + path + " removed.")
def remove_duplicates(dir, subname):
    """Delete files directly inside ``dir`` that duplicate another file's content.

    Only regular files whose name ends with ``subname`` are considered;
    sub-directories are not descended into.  Files are compared by the MD5
    digest of their bytes.  Names are visited in sorted order, so within each
    group of identical files the alphabetically first one is kept and the
    others are removed.  One line is printed for each file that is deleted.

    Args:
        dir: Directory to scan (non-recursive).
        subname: File-name suffix to match (for example ``".txt"``).
    """
    # hashlib supersedes the deprecated ``md5`` module and is available on
    # both Python 2.5+ and Python 3.  Imported locally so this function does
    # not depend on any change to the file-level imports.
    import hashlib

    seen_digests = set()
    # os.listdir order is arbitrary; sort so the surviving copy is predictable.
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(subname):
            continue
        # listdir yields bare names: resolve them against ``dir`` rather than
        # the current working directory.
        path = os.path.join(dir, filename)
        if not os.path.isfile(path):
            continue
        # Binary mode hashes the exact bytes; ``with`` closes the handle.
        with open(path, "rb") as handle:
            digest = hashlib.md5(handle.read()).hexdigest()
        if digest in seen_digests:
            os.remove(path)
            print("Duplicated File: " + filename + " removed.")
        else:
            seen_digests.add(digest)
def remove_format(dir, subname):
    """Write a whitespace-normalised copy of each matching file in ``dir``.

    Every regular file directly inside ``dir`` whose name ends with
    ``subname`` and does not already start with ``cleaned_`` is processed.
    Each line is rewritten as follows:

    * runs of spaces are collapsed to a single space and the line is
      stripped of leading/trailing whitespace;
    * a ``(number):`` group such as ``( 1234):``, together with any spaces
      in front of it, is replaced by a bare ``:``.

    Leading timestamps are deliberately left in place for now.  The result
    is written next to the input as ``cleaned_<original name>``, with lines
    joined by newlines and no trailing newline.

    Args:
        dir: Directory holding the input files (non-recursive).
        subname: File-name suffix to match (for example ``".txt"``).
    """
    # Compile once instead of re-specifying the patterns for every line.
    multi_space = re.compile(r"[ ]+")
    paren_number = re.compile(r"[ ]*\([ ]*[0-9]+\):")

    for filename in os.listdir(dir):
        # Skip non-matching names and our own previous output.
        if not filename.endswith(subname) or filename.startswith("cleaned_"):
            continue
        # listdir yields bare names: resolve them against ``dir`` rather than
        # the current working directory.
        source_path = os.path.join(dir, filename)
        if not os.path.isfile(source_path):
            continue

        cleaned = []
        with open(source_path) as src:
            for line in src:
                # Minimise consecutive spaces for consistency.
                text = multi_space.sub(" ", line).strip()
                # Drop the "(number)" that precedes the start of the message.
                text = paren_number.sub(":", text).strip()
                cleaned.append(text)

        with open(os.path.join(dir, "cleaned_" + filename), "w") as dst:
            dst.write("\n".join(cleaned))
if __name__ == '__main__':
    # Run the three clean-up passes, in order, over the ".txt" files of the
    # current working directory: drop empty files, drop duplicates, then
    # write the normalised copies.
    subname = ".txt"
    current_dir = os.getcwd()
    for cleanup_step in (remove_empties, remove_duplicates, remove_format):
        cleanup_step(current_dir, subname)