-
Notifications
You must be signed in to change notification settings - Fork 0
테스트용 가상 데이터 생성
ISO edited this page Aug 28, 2023
·
2 revisions
- 수백만 건의 데이터를 생성할 것을 가정한다.
- 옵션 별로 다양한 선택률을 가정하여 옵션 가중치로 랜덤하게 생성한다.
Python 생성 코드
import random
import datetime
class estimate:
def __init__(self, id, detail_trim_id, trim_exterior_color_id, trim_interior_color_id, options, packages) -> None:
self.id = id
self.detail_trim_id = detail_trim_id
self.trim_exterior_color_id = trim_exterior_color_id
self.trim_interior_color_id = trim_interior_color_id
self.options = options
self.packages = packages
def __eq__(self, __value: object) -> bool:
if isinstance(__value, estimate) :
return self.detail_trim_id == __value.detail_trim_id and \
self.trim_exterior_color_id == __value.trim_exterior_color_id and \
self.trim_interior_color_id == __value.trim_interior_color_id and \
self.options == __value.options and \
self.packages == __value.packages
def __repr__(self) -> str:
return str(self.id)
def __str__(self) -> str:
return str(self.options)
exteriorColor = [{},{"start": 1, "end": 6}, {"start": 7, "end": 12}, {"start": 13, "end": 18}, {"start": 19, "end": 25}]
interiorColor = [(),(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,8),(9,10),(11,12),(13,14),(15,16),(17,18),(19,20)]
options = [97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107]
option_weights = [100, 70, 55, 45, 34, 28, 21, 16, 12, 8, 3]
packages = [1, 2, 3, 4]
package_weights = [10, 4, 2, 1]
def randomTimeStamp():
# 현재 시간
current_timestamp = datetime.datetime.now().timestamp()
# 1년전 시간
one_year_ago = current_timestamp - (365 * 24 * 60 * 60)
# 현재 시간 ~ 1년 시간 랜덤으로 가져오기 =
random_timestamp = random.uniform(one_year_ago, current_timestamp)
random_datetime = datetime.datetime.fromtimestamp(random_timestamp)
return random_datetime.strftime('%Y-%m-%d %H:%M:%S')
def select_random_elements(input_list:list, result_list: list, prob:list, m, mm):
if m == mm:
return result_list
privot = random.randint(1,100)
if prob[m] >= privot:
result_list.append(input_list[m])
return select_random_elements(input_list, result_list, prob, m + 1, mm)
def create():
trimId = 2
detailTrimId = random.randint(9, 16)
exteriorColorId = random.randint(exteriorColor[trimId].get("start"), exteriorColor[trimId].get("end"))
interiorColorId = random.randint(interiorColor[exteriorColorId][0], interiorColor[exteriorColorId][1])
return detailTrimId, exteriorColorId, interiorColorId
def estimatesQuery(detailTrimId, exteriorColorId, interiorColorId, estimateId, separate = '|'):
return str(estimateId) + separate + str(detailTrimId) + separate + str(exteriorColorId) + separate + str(interiorColorId) + "\n"
def estimateOptionsQuery():
select_options = select_random_elements(options, [], option_weights, 0, len(options))
return select_options
def estimatePackagesQuery():
select_packages = select_random_elements(packages, [], package_weights, 0, len(packages))
return select_packages
if __name__ == "__main__" :
estimate_list = []
estimate_list_index = {
9 :{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
10:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
11:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
12:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
13:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
14:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
15:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}},
16:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}
}
data_count = 4000001
release_record_csv = open("release_records.csv", "w+")
estimates_csv = open("estimates.csv", "w+")
estimate_options_csv = open("estimate_options.csv", "w+")
estimate_packages_csv = open("estimate_packages.csv", "w+")
estimates_csv.write("id|detail_trim_id|trim_exterior_color_id|trim_interior_color_id" + "\n")
estimate_options_csv.write("estimate_id|model_option_id" + "\n")
estimate_packages_csv.write("estimate_id|model_package_id" + "\n")
release_record_csv.write("id|estimate_id|create_date" + "\n")
ESTID = 1
for estimateId in range(1, data_count) :
if estimateId % 10000 == 0:
print(estimateId)
createReturn = create()
cur_estimate = estimate(estimateId, createReturn[0], createReturn[1], createReturn[2], estimateOptionsQuery(), estimatePackagesQuery())
estimate_id_target = estimateId
isNewEst = True
for idx in range(len(estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]])) :
## 존재하는 견적서
if cur_estimate == estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]][idx] :
cur_estimate = estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]][idx]
isNewEst = False
break
separate = '|'
## 새로운 견적서
if isNewEst == True :
cur_estimate.id = ESTID
estimate_list.append(cur_estimate)
estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]].append(cur_estimate)
estimates_csv.write(
str(cur_estimate.id) + separate + str(cur_estimate.detail_trim_id) + separate +
str(cur_estimate.trim_exterior_color_id) + separate + str(cur_estimate.trim_interior_color_id) + "\n"
)
estimate_options_csv.writelines(
[str(cur_estimate.id) + separate + str(option) + "\n" for option in cur_estimate.options]
)
estimate_packages_csv.writelines(
[str(cur_estimate.id) + separate + str(package) + "\n" for package in cur_estimate.packages]
)
ESTID += 1
release_record_csv.write(
separate.join([str(estimateId), str(cur_estimate.id), randomTimeStamp()]) + "\n"
)
estimates_csv.close()
estimate_options_csv.close()
estimate_packages_csv.close()
- 생성된 견적서를 사용하여 필요한 초기 지연 테이블을 생성한다.
Python 생성 코드
option_to_hash_tag = {
97: ["장거리 운전", "안전", "주차/출차"],
98: ["레저", "스타일"],
99: ["쾌적"],
100: ["스타일"],
101: ["쾌적", "자녀", "다인가족"],
102: ["자녀 통학", "안전", "다인가족"],
103: ["레저", "국내여행", "쾌적", "장거리 운전"],
104: ["스타일"],
105: ["레저", "국내여행", "캠핑"],
106: ["스타일"],
107: ["스타일"],
108: ["스타일", "주차/출차"]
}
package_to_hash_tag = {
1: ["레저", "스타일", "안전", "자녀", "반려동물"],
2: ["스마트", "장거리 운전", "안전", "고속도로"],
3: ["스마트", "주차/출차", "초보운전"],
4: ["레저", "국내여행", "캠핑"]
}
# 기존의 option_to_hash_tag 및 package_to_hash_tag 데이터
# ...
def split_estimate_line(estimates_line: str, separate='|'):
return [int(item) for item in estimates_line.split(separate)]
estimate_path = "estimates.csv"
estimate_options_path = "estimate_options.csv"
estimate_packages_path = "estimate_packages.csv"
with open(estimate_path, "r") as estimate_file:
estlimate_lines = estimate_file.readlines()[1:]
with open(estimate_options_path, "r") as options_file:
options_lines = options_file.readlines()[1:]
with open(estimate_packages_path, "r") as package_file:
package_lines = package_file.readlines()[1:]
# 데이터를 그룹화할 딕셔너리 생성
grouped_data = {}
varli = 1
for estimate_line in estlimate_lines:
grouped_data[varli] = {"options": [], "packages": []}
varli += 1
for options_line in options_lines:
options_data = split_estimate_line(options_line.strip())
group_option_key = options_data[0]
if group_option_key not in grouped_data:
grouped_data[group_option_key] = {"options": [], "packages": []}
grouped_data[group_option_key]["options"].append(options_data[1])
for package_line in package_lines:
package_data = split_estimate_line(package_line.strip())
group_package_key = package_data[0]
if group_package_key not in grouped_data:
grouped_data[group_package_key] = {"options": [], "packages": []}
grouped_data[group_package_key]["packages"].append(package_data[1])
def build_hash_tag(hash_tags: list, hash_tag_map: dict):
for hash_tag in hash_tags:
if hash_tag not in hash_tag_map:
hash_tag_map[hash_tag] = 0
hash_tag_map[hash_tag] += 1
# 해시 태그 생성 함수 수정
def make_hash(data, hash_tags: map):
option_ids = data["options"]
for id in option_ids:
hash_tag = option_to_hash_tag.get(id, [])
build_hash_tag(hash_tag, hash_tags)
return hash_tags
def make_hash2(data, hash_tags: map):
package_ids = data["packages"]
for id in package_ids:
hash_tag = package_to_hash_tag.get(id, [])
build_hash_tag(hash_tag, hash_tags)
return hash_tags
def smart_transform(sorted_hash_tag_map: map):
return "?".join([f"{key}:{value}" for key, value in sorted_hash_tag_map.items()])
hashList = []
hashList_reverse = {}
queue_csv = open("pending_hash_tag_similarities.csv", "w+")
queue_csv.write("idx|hash_tag_key|trim_id|last_index_calculated\n")
similirty_csv = open("similar_estimates.csv", "w+")
similirty_csv.write("hash_tag_index|estimate_id\n")
idx = 1
for data_key, data_value in grouped_data.items():
if data_key % 10000 == 0:
print(data_key)
hash_tags = {}
make_hash(data_value, hash_tags)
make_hash2(data_value, hash_tags)
from collections import OrderedDict
sorted_hash_tag_map = OrderedDict(sorted(hash_tags.items()))
smart = smart_transform(sorted_hash_tag_map)
if len(smart) != 0 and smart not in hashList:
queue_csv.write(str(idx) + "|" + smart + "|" + str(2) + "|" + str(0) + "\n")
hashList.append(smart)
hashList_reverse[smart] = idx
idx += 1
if len(smart) != 0:
similirty_csv.write(str(hashList_reverse[smart]) + "|" + str(data_key) + "\n")
queue_csv.close()
- 가격 컬럼 역정규화를 지원한다.
- 빠른 평균 가격 서비스를 지원하기 위해 기존에 생성한 견적서에 추가로 가격을 계산하여 추가한다.
Python 생성 코드
def split_estimate_line(estimates_line: str, separate='|'):
return [int(item) for item in estimates_line.split(separate)]
detailTrimIdPrice = {
9: 38960000 + 1480000,
10: 38960000 + 1480000,
11: 38960000 + 1480000 + 2370000,
12: 38960000 + 1480000 + 2370000,
13: 38960000,
14: 38960000,
15: 38960000 + 2370000,
16: 38960000 + 237000
}
optionPrice = {
97: 690000,
98: 890000,
99: 400000,
100: 840000,
101: 400000,
102: 350000,
103: 300000,
104: 490000,
105: 840000,
106: 840000,
107: 3660000
}
packagePrice = {
1: 1090000,
2: 790000,
3: 690000,
4: 250000
}
exteriorPrice = {
"7": 0,
"8": 0,
"9": 0,
"10": 0,
"11": 0,
"12": 100000,
}
with open("estimates.csv", "r") as estimate_file:
estlimate_lines = estimate_file.readlines()[1:]
with open("estimate_options.csv", "r") as options_file:
options_lines = options_file.readlines()[1:]
with open("estimate_packages.csv", "r") as package_file:
package_lines = package_file.readlines()[1:]
# 데이터를 그룹화할 딕셔너리 생성
grouped_data = {}
index = 0
for estimate in estlimate_lines:
index += 1
estimate_data = split_estimate_line(estimate.strip())
grouped_data[index] = {"price": detailTrimIdPrice[estimate_data[1]] + exteriorPrice[str(estimate_data[2])]}
for options_line in options_lines:
options_data = split_estimate_line(options_line.strip())
group_option_key = options_data[0]
grouped_data[group_option_key]["price"] = grouped_data[group_option_key]["price"] + optionPrice[options_data[1]]
for package_line in package_lines:
package_data = split_estimate_line(package_line.strip())
group_package_key = package_data[0]
grouped_data[group_package_key]["price"] = grouped_data[group_package_key]["price"] + packagePrice[package_data[1]]
new_estimates_csv = open("new_estimates.csv", "w+")
new_estimates_csv.write("id|detail_trim_id|trim_exterior_color_id|trim_interior_color_id|price" + "\n")
for estimate in estlimate_lines:
estimate_data = split_estimate_line(estimate.strip())
if estimate_data[0] % 10000 == 0:
print(estimate_data[0])
estimate = estimate.rstrip("\n")
new_estimates_csv.write(estimate + "|" + str(grouped_data[estimate_data[0]]["price"]) + "\n")
new_estimates_csv.close()