Skip to content

테스트용 가상 데이터 생성

ISO edited this page Aug 28, 2023 · 2 revisions

견적서 레코드 생성

  • 수백만 건의 데이터를 생성할 것을 가정한다.
  • 옵션 별로 다양한 선택률을 가정하여 옵션 가중치로 랜덤하게 생성한다.
Python 생성 코드
import random
import datetime

class estimate:
    def __init__(self, id, detail_trim_id, trim_exterior_color_id, trim_interior_color_id, options, packages) -> None:
        self.id = id
        self.detail_trim_id = detail_trim_id
        self.trim_exterior_color_id = trim_exterior_color_id
        self.trim_interior_color_id = trim_interior_color_id
        self.options = options
        self.packages = packages

    def __eq__(self, __value: object) -> bool:
        if isinstance(__value, estimate) :
            return self.detail_trim_id == __value.detail_trim_id and \
                self.trim_exterior_color_id == __value.trim_exterior_color_id and \
                self.trim_interior_color_id == __value.trim_interior_color_id and \
                self.options == __value.options and \
                self.packages == __value.packages 
    
    def __repr__(self) -> str:
        return str(self.id)

    def __str__(self) -> str:
        return str(self.options)

exteriorColor = [{},{"start": 1, "end": 6}, {"start": 7, "end": 12}, {"start": 13, "end": 18}, {"start": 19, "end": 25}]
interiorColor = [(),(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,8),(9,10),(11,12),(13,14),(15,16),(17,18),(19,20)]

options = [97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107]
option_weights = [100, 70, 55, 45, 34, 28, 21, 16, 12, 8, 3]

packages = [1, 2, 3, 4]
package_weights = [10, 4, 2, 1]

def randomTimeStamp():
    # 현재 시간
    current_timestamp = datetime.datetime.now().timestamp()

    # 1년전 시간
    one_year_ago = current_timestamp - (365 * 24 * 60 * 60)

    # 현재 시간 ~ 1년 시간 랜덤으로 가져오기 =
    random_timestamp = random.uniform(one_year_ago, current_timestamp)
    random_datetime = datetime.datetime.fromtimestamp(random_timestamp)
    return random_datetime.strftime('%Y-%m-%d %H:%M:%S')

def select_random_elements(input_list:list, result_list: list, prob:list, m, mm):
    if m == mm:
        return result_list
    
    privot = random.randint(1,100)
    if prob[m] >= privot:
        result_list.append(input_list[m])
    return select_random_elements(input_list, result_list, prob, m + 1, mm)

def create():
    trimId = 2
    detailTrimId = random.randint(9, 16)
    exteriorColorId = random.randint(exteriorColor[trimId].get("start"), exteriorColor[trimId].get("end"))
    interiorColorId = random.randint(interiorColor[exteriorColorId][0], interiorColor[exteriorColorId][1])

    return detailTrimId, exteriorColorId, interiorColorId

def estimatesQuery(detailTrimId, exteriorColorId, interiorColorId, estimateId, separate = '|'):
    return str(estimateId) + separate + str(detailTrimId) + separate + str(exteriorColorId) + separate + str(interiorColorId) + "\n"

def estimateOptionsQuery():
    select_options = select_random_elements(options, [], option_weights, 0, len(options))
    return select_options

def estimatePackagesQuery():
    select_packages = select_random_elements(packages, [], package_weights, 0, len(packages))
    return select_packages


if __name__ == "__main__" :
    estimate_list = []
    estimate_list_index = {
        9 :{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        10:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        11:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        12:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        13:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        14:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        15:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}, 
        16:{7:{7:[], 8:[]}, 8:{9:[], 10:[]}, 9:{11:[], 12:[]}, 10:{13:[], 14:[]}, 11:{15:[], 16:[]}, 12:{17:[], 18:[]}}
        }
    data_count = 4000001

    release_record_csv = open("release_records.csv", "w+")
    estimates_csv = open("estimates.csv", "w+")
    estimate_options_csv = open("estimate_options.csv", "w+")
    estimate_packages_csv = open("estimate_packages.csv", "w+")

    estimates_csv.write("id|detail_trim_id|trim_exterior_color_id|trim_interior_color_id" + "\n")
    estimate_options_csv.write("estimate_id|model_option_id" + "\n")
    estimate_packages_csv.write("estimate_id|model_package_id" + "\n")
    release_record_csv.write("id|estimate_id|create_date" + "\n")

    ESTID = 1
    for estimateId in range(1, data_count) :
        if estimateId % 10000 == 0:
            print(estimateId)
        createReturn = create()

        cur_estimate = estimate(estimateId, createReturn[0], createReturn[1], createReturn[2], estimateOptionsQuery(), estimatePackagesQuery())

        
        estimate_id_target = estimateId

        isNewEst = True
        
        for idx in range(len(estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]])) :
            ## 존재하는 견적서
            if cur_estimate == estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]][idx] :
                cur_estimate = estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]][idx]
                isNewEst = False
                break

        separate = '|'
        ## 새로운 견적서
        if isNewEst == True :
            cur_estimate.id = ESTID
            estimate_list.append(cur_estimate)
            estimate_list_index[createReturn[0]][createReturn[1]][createReturn[2]].append(cur_estimate)
            estimates_csv.write(
                str(cur_estimate.id) + separate + str(cur_estimate.detail_trim_id) + separate + 
                str(cur_estimate.trim_exterior_color_id) + separate + str(cur_estimate.trim_interior_color_id) + "\n"
            )

            estimate_options_csv.writelines(
                [str(cur_estimate.id) + separate + str(option) + "\n" for option in cur_estimate.options]
            )
            
            estimate_packages_csv.writelines(
                [str(cur_estimate.id) + separate + str(package) + "\n" for package in cur_estimate.packages]
            )
            ESTID += 1
                         
        
        release_record_csv.write(
            separate.join([str(estimateId), str(cur_estimate.id), randomTimeStamp()]) + "\n"
        )
    estimates_csv.close()
    estimate_options_csv.close()
    estimate_packages_csv.close()

초기 지연 레코드 생성

  • 생성된 견적서를 사용하여 필요한 초기 지연 테이블을 생성한다.
Python 생성 코드
option_to_hash_tag = {
    97: ["장거리 운전", "안전", "주차/출차"],
    98: ["레저", "스타일"],
    99: ["쾌적"],
    100: ["스타일"],
    101: ["쾌적", "자녀", "다인가족"],
    102: ["자녀 통학", "안전", "다인가족"],
    103: ["레저", "국내여행", "쾌적", "장거리 운전"],
    104: ["스타일"],
    105: ["레저", "국내여행", "캠핑"],
    106: ["스타일"],
    107: ["스타일"],
    108: ["스타일", "주차/출차"]
}

package_to_hash_tag = {
    1: ["레저", "스타일", "안전", "자녀", "반려동물"],
    2: ["스마트", "장거리 운전", "안전", "고속도로"],
    3: ["스마트", "주차/출차", "초보운전"],
    4: ["레저", "국내여행", "캠핑"]
}
# 기존의 option_to_hash_tag 및 package_to_hash_tag 데이터
# ...

def split_estimate_line(estimates_line: str, separate='|'):
    return [int(item) for item in estimates_line.split(separate)]

estimate_path = "estimates.csv"
estimate_options_path = "estimate_options.csv"
estimate_packages_path = "estimate_packages.csv"

with open(estimate_path, "r") as estimate_file:
    estlimate_lines = estimate_file.readlines()[1:]

with open(estimate_options_path, "r") as options_file:
    options_lines = options_file.readlines()[1:]

with open(estimate_packages_path, "r") as package_file:
    package_lines = package_file.readlines()[1:]

# 데이터를 그룹화할 딕셔너리 생성
grouped_data = {}

varli = 1
for estimate_line in estlimate_lines:
    grouped_data[varli] = {"options": [], "packages": []}
    varli += 1

for options_line in options_lines:
    options_data = split_estimate_line(options_line.strip())

    group_option_key = options_data[0]

    if group_option_key not in grouped_data:
        grouped_data[group_option_key] = {"options": [], "packages": []}

    grouped_data[group_option_key]["options"].append(options_data[1])

for package_line in package_lines:
    package_data = split_estimate_line(package_line.strip())

    group_package_key = package_data[0]

    if group_package_key not in grouped_data:
        grouped_data[group_package_key] = {"options": [], "packages": []}

    grouped_data[group_package_key]["packages"].append(package_data[1])

def build_hash_tag(hash_tags: list, hash_tag_map: dict):
    for hash_tag in hash_tags:
        if hash_tag not in hash_tag_map:
            hash_tag_map[hash_tag] = 0
        hash_tag_map[hash_tag] += 1

# 해시 태그 생성 함수 수정
def make_hash(data, hash_tags: map):
    option_ids = data["options"]

    for id in option_ids:
        hash_tag = option_to_hash_tag.get(id, [])
        build_hash_tag(hash_tag, hash_tags)
    return hash_tags

def make_hash2(data, hash_tags: map):
    package_ids = data["packages"]

    for id in package_ids:
        hash_tag = package_to_hash_tag.get(id, [])
        build_hash_tag(hash_tag, hash_tags)
    return hash_tags

def smart_transform(sorted_hash_tag_map: map):
    return "?".join([f"{key}:{value}" for key, value in sorted_hash_tag_map.items()])

hashList = []
hashList_reverse = {}

queue_csv = open("pending_hash_tag_similarities.csv", "w+")
queue_csv.write("idx|hash_tag_key|trim_id|last_index_calculated\n")

similirty_csv = open("similar_estimates.csv", "w+")
similirty_csv.write("hash_tag_index|estimate_id\n")
idx = 1
for data_key, data_value in grouped_data.items():
    if data_key % 10000 == 0:
        print(data_key)
    hash_tags = {}
    make_hash(data_value, hash_tags)
    make_hash2(data_value, hash_tags)
    from collections import OrderedDict
    sorted_hash_tag_map = OrderedDict(sorted(hash_tags.items()))

    smart = smart_transform(sorted_hash_tag_map)
    if len(smart) != 0 and smart not in hashList:
        queue_csv.write(str(idx) + "|"  + smart + "|" + str(2) + "|" + str(0) + "\n")
        hashList.append(smart)
        hashList_reverse[smart] = idx
        idx += 1

    if len(smart) != 0:
        similirty_csv.write(str(hashList_reverse[smart]) + "|" + str(data_key) + "\n")
    
queue_csv.close()

기존 견적서에 가격 컬럼 확장

  • 가격 컬럼 역정규화를 지원한다.
  • 빠른 평균 가격 서비스를 지원하기 위해 기존에 생성한 견적서에 추가로 가격을 계산하여 추가한다.
Python 생성 코드
def split_estimate_line(estimates_line: str, separate='|'):
    return [int(item) for item in estimates_line.split(separate)]

detailTrimIdPrice = {
    9: 38960000 + 1480000,
    10: 38960000 + 1480000,
    11: 38960000 + 1480000 + 2370000,
    12: 38960000 + 1480000 + 2370000,
    13: 38960000,
    14: 38960000,
    15: 38960000 + 2370000,
    16: 38960000 + 237000
}

optionPrice = {
    97: 690000,
    98: 890000,
    99: 400000,
    100: 840000,
    101: 400000,
    102: 350000,
    103: 300000,
    104: 490000,
    105: 840000,
    106: 840000,
    107: 3660000
}

packagePrice = {
    1: 1090000,
    2: 790000,
    3: 690000,
    4: 250000
}

exteriorPrice =  {
    "7": 0,
    "8": 0,
    "9": 0,
    "10": 0,
    "11": 0,
    "12": 100000,
}



with open("estimates.csv", "r") as estimate_file:
    estlimate_lines = estimate_file.readlines()[1:]

with open("estimate_options.csv", "r") as options_file:
    options_lines = options_file.readlines()[1:]

with open("estimate_packages.csv", "r") as package_file:
    package_lines = package_file.readlines()[1:]

# 데이터를 그룹화할 딕셔너리 생성
grouped_data = {}
    

index = 0
for estimate in estlimate_lines:
    index += 1
    estimate_data = split_estimate_line(estimate.strip())
    grouped_data[index] = {"price": detailTrimIdPrice[estimate_data[1]] + exteriorPrice[str(estimate_data[2])]}

for options_line in options_lines:
    options_data = split_estimate_line(options_line.strip())

    group_option_key = options_data[0]

    grouped_data[group_option_key]["price"] = grouped_data[group_option_key]["price"] + optionPrice[options_data[1]]


for package_line in package_lines:
    package_data = split_estimate_line(package_line.strip())

    group_package_key = package_data[0]

    grouped_data[group_package_key]["price"] = grouped_data[group_package_key]["price"] + packagePrice[package_data[1]]




new_estimates_csv = open("new_estimates.csv", "w+")
new_estimates_csv.write("id|detail_trim_id|trim_exterior_color_id|trim_interior_color_id|price" + "\n")
for estimate in estlimate_lines:
    estimate_data = split_estimate_line(estimate.strip())
    if estimate_data[0] % 10000 == 0:
        print(estimate_data[0])
    estimate = estimate.rstrip("\n")
    new_estimates_csv.write(estimate + "|" + str(grouped_data[estimate_data[0]]["price"]) + "\n")

new_estimates_csv.close()