-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
50 lines (42 loc) · 2.06 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import io
from google.cloud import vision
from google.cloud.vision import types
""" Removes non image files, large files and unrelated images """
def clean_images(
    dir,
    client,
    MAX_IMAGE_SIZE,
    wanted_labels=("GENERAL LABEL1", "GENERAL LABEL2", "GENERAL LABEL3"),
):
    """Delete unwanted files from a directory tree, in place.

    Walks ``dir`` recursively and removes every file that

    * does not end in ``.jpg``, ``.jpeg``, ``.png`` or ``.webp``
      (compared case-insensitively),
    * is ``MAX_IMAGE_SIZE`` bytes or larger, or
    * is given none of ``wanted_labels`` by ``client.label_detection``.

    The outcome for each file is printed: the bare path when the file is
    kept, or the path followed by the reason it was removed or skipped.

    Args:
        dir: Root directory to clean (``str`` or ``bytes`` path).
        client: Object exposing ``label_detection(image=...)`` whose result
            has a ``label_annotations`` sequence of objects with a
            ``description`` attribute. Presumably a Google Cloud Vision
            ``ImageAnnotatorClient`` -- confirm against the caller.
        MAX_IMAGE_SIZE: Size threshold in bytes; images at or above it are
            removed without being sent to ``client``.
        wanted_labels: Label descriptions that mark an image as relevant.
            An image is kept when at least one returned label matches one
            of these exactly. A single string is treated as one label.

    Returns:
        None.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".webp")
    if isinstance(wanted_labels, str):
        wanted_labels = (wanted_labels,)
    wanted = frozenset(wanted_labels)

    for root, _dirs, files in os.walk(dir):
        for name in files:
            # Always address the file through the directory os.walk is
            # currently visiting. Rebuilding the path from `dir` plus the
            # last folder name breaks for files directly in `dir` and for
            # files nested more than one level deep.
            path = os.fsdecode(os.path.join(root, name))

            # Case-insensitive so that e.g. "PHOTO.JPG" is not deleted as a
            # non-image file.
            if not path.lower().endswith(image_extensions):
                print(path + " removed due to TYPE")
                os.remove(path)
                continue

            if os.stat(path).st_size >= MAX_IMAGE_SIZE:
                print(path + " removed due to SIZE")
                os.remove(path)
                continue

            with io.open(path, 'rb') as image_file:
                content = image_file.read()

            # NOTE(review): `vision.types` matches this file's imports but
            # was dropped in google-cloud-vision 2.x -- confirm the pinned
            # library version.
            image = vision.types.Image(content=content)
            response = client.label_detection(image=image)

            # A failed request comes back with no labels; without this
            # guard the image would be deleted as "unrelated" because of an
            # API problem rather than because of its contents.
            error_message = getattr(
                getattr(response, "error", None), "message", ""
            )
            if error_message:
                print(path + " skipped due to API ERROR: " + error_message)
                continue

            if any(
                label.description in wanted
                for label in response.label_annotations
            ):
                print(path)
            else:
                print(path + " removed due to CONTENTS")
                os.remove(path)
    return