diff --git a/docker/lakesoul-ann-benchmark-env/README.md b/docker/lakesoul-ann-benchmark-env/README.md
new file mode 100644
index 000000000..8d4783d5c
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/README.md
@@ -0,0 +1,30 @@
+# LakeSoul ANN Benchmark Environment
+
+## Steps to Run
+
+1. Compile the test classes
+```bash
+bash compile_test_class.sh
+```
+This script compiles the required test class files.
+
+2. Start all required containers
+```bash
+bash start_all.sh
+```
+This script starts the following services (a quick status check follows the list):
+- PostgreSQL
+- Spark
+- MinIO
+
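+To confirm the containers are up, you can list them by name (a minimal check; the container names come from `docker-compose.yml`):
+```bash
+# Expect lakesoul-ann-pg, lakesoul-ann-minio and lakesoul-ann-spark to be listed
+docker ps --filter "name=lakesoul-ann"
+```
+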
+3. Prepare the ANN test table
+```bash
+bash prepare_ann_table.sh
+```
+This script creates and populates the table used for the ANN tests.
+
+4. Run the query tests
+```bash
+bash run_query.sh
+```
+This script executes the predefined query test cases.
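+
+To inspect the table or run ad-hoc queries interactively, you can attach to the Spark container and open a `spark-sql` shell (a minimal sketch; the container name is taken from `docker-compose.yml` and the Spark install path from the bitnami/spark image):
+```bash
+# Open an interactive Spark SQL shell inside the benchmark's Spark container
+docker exec -it lakesoul-ann-spark /opt/bitnami/spark/bin/spark-sql
+```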
diff --git a/docker/lakesoul-ann-benchmark-env/compile_test_class.sh b/docker/lakesoul-ann-benchmark-env/compile_test_class.sh
new file mode 100644
index 000000000..42af7d8bf
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/compile_test_class.sh
@@ -0,0 +1,2 @@
+cd ../..
+JAVA_HOME=/usr/lib/jvm/java-1.11.0-openjdk-amd64 mvn package -DskipTests -Prelease-linux-x86-64
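+# Note (assumption): docker-compose.yml mounts the packaged lakesoul-spark jar
+# from packages/jars/, so the artifact built above may need to be copied there
+# (e.g. by start_all.sh) before the containers are started.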
diff --git a/docker/lakesoul-ann-benchmark-env/conf/my.conf b/docker/lakesoul-ann-benchmark-env/conf/my.conf
new file mode 100644
index 000000000..8a4f2b1ac
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/conf/my.conf
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+[mysqld]
+pid-file = /var/run/mysqld/mysqld.pid
+socket = /var/run/mysqld/mysqld.sock
+datadir = /var/lib/mysql
+secure-file-priv= /data/tpch/
+# Disabling symbolic-links is recommended to prevent assorted security risks
+symbolic-links=0
+
+# Custom config should go here
+!includedir /etc/mysql/conf.d/
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/conf/properties b/docker/lakesoul-ann-benchmark-env/conf/properties
new file mode 100644
index 000000000..dceb3d5d1
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/conf/properties
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# table_num: number of tables to generate
+table_num=10
+# host: MySQL hostname
+host=localhost
+# user: MySQL username; password: MySQL password; port: MySQL port; db: MySQL database name
+user=root
+password=root
+port=3306
+db=test_cdc
+timezone=UTC
+# row_num: number of rows to randomly insert into each table
+row_num=500
+# delete_num: number of rows to randomly delete
+delete_num=50
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/fairscheduler.xml.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/fairscheduler.xml.template
new file mode 100644
index 000000000..385b2e772
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/fairscheduler.xml.template
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<allocations>
+  <pool name="production">
+    <schedulingMode>FAIR</schedulingMode>
+    <weight>1</weight>
+    <minShare>2</minShare>
+  </pool>
+  <pool name="test">
+    <schedulingMode>FIFO</schedulingMode>
+    <weight>2</weight>
+    <minShare>3</minShare>
+  </pool>
+</allocations>
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/log4j2.properties.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/log4j2.properties.template
new file mode 100644
index 000000000..ab96e03ba
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/log4j2.properties.template
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+rootLogger.level = info
+rootLogger.appenderRef.stdout.ref = console
+
+# In the pattern layout configuration below, we specify an explicit `%ex` conversion
+# pattern for logging Throwables. If this was omitted, then (by default) Log4J would
+# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional
+# class packaging information. That extra information can sometimes add a substantial
+# performance overhead, so we disable it in our default logging config.
+# For more information, see SPARK-39361.
+appender.console.type = Console
+appender.console.name = console
+appender.console.target = SYSTEM_ERR
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex
+
+# Set the default spark-shell/spark-sql log level to WARN. When running the
+# spark-shell/spark-sql, the log level for these classes is used to overwrite
+# the root logger's log level, so that the user can have different defaults
+# for the shell and regular Spark apps.
+logger.repl.name = org.apache.spark.repl.Main
+logger.repl.level = warn
+
+logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
+logger.thriftserver.level = warn
+
+# Settings to quiet third party logs that are too verbose
+logger.jetty1.name = org.sparkproject.jetty
+logger.jetty1.level = warn
+logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
+logger.jetty2.level = error
+logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
+logger.replexprTyper.level = info
+logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
+logger.replSparkILoopInterpreter.level = info
+logger.parquet1.name = org.apache.parquet
+logger.parquet1.level = error
+logger.parquet2.name = parquet
+logger.parquet2.level = error
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
+logger.RetryingHMSHandler.level = fatal
+logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
+logger.FunctionRegistry.level = error
+
+# For deploying Spark ThriftServer
+# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
+appender.console.filter.1.type = RegexFilter
+appender.console.filter.1.regex = .*Thrift error occurred during processing of message.*
+appender.console.filter.1.onMatch = deny
+appender.console.filter.1.onMismatch = neutral
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/metrics.properties.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/metrics.properties.template
new file mode 100644
index 000000000..f52d33fd6
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/metrics.properties.template
@@ -0,0 +1,210 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# syntax: [instance].sink|source.[name].[options]=[value]
+
+# This file configures Spark's internal metrics system. The metrics system is
+# divided into instances which correspond to internal components.
+# Each instance can be configured to report its metrics to one or more sinks.
+# Accepted values for [instance] are "master", "worker", "executor", "driver",
+# and "applications". A wildcard "*" can be used as an instance name, in
+# which case all instances will inherit the supplied property.
+#
+# Within an instance, a "source" specifies a particular set of grouped metrics.
+# there are two kinds of sources:
+# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will
+# collect a Spark component's internal state. Each instance is paired with a
+# Spark source that is added automatically.
+# 2. Common sources, like JvmSource, which will collect low level state.
+# These can be added through configuration options and are then loaded
+# using reflection.
+#
+# A "sink" specifies where metrics are delivered to. Each instance can be
+# assigned one or more sinks.
+#
+# The sink|source field specifies whether the property relates to a sink or
+# source.
+#
+# The [name] field specifies the name of source or sink.
+#
+# The [options] field is the specific property of this source or sink. The
+# source or sink is responsible for parsing this property.
+#
+# Notes:
+# 1. To add a new sink, set the "class" option to a fully qualified class
+# name (see examples below).
+# 2. Some sinks involve a polling period. The minimum allowed polling period
+# is 1 second.
+# 3. Wildcard properties can be overridden by more specific properties.
+# For example, master.sink.console.period takes precedence over
+# *.sink.console.period.
+# 4. A metrics specific configuration
+# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be
+# added to Java properties using -Dspark.metrics.conf=xxx if you want to
+# customize metrics system. You can also put the file in ${SPARK_HOME}/conf
+# and it will be loaded automatically.
+# 5. The MetricsServlet sink is added by default as a sink in the master,
+# worker and driver, and you can send HTTP requests to the "/metrics/json"
+# endpoint to get a snapshot of all the registered metrics in JSON format.
+# For master, requests to the "/metrics/master/json" and
+# "/metrics/applications/json" endpoints can be sent separately to get
+# metrics snapshots of the master instance and applications. This
+# MetricsServlet does not have to be configured.
+# 6. The metrics system can also be configured using Spark configuration
+# parameters. The relevant parameter names are formed by adding the
+# prefix "spark.metrics.conf." to the configuration entries detailed in
+# this file (see examples below).
+
+## List of available common sources and their properties.
+
+# org.apache.spark.metrics.source.JvmSource
+# Note: Currently, JvmSource is the only available common source.
+# It can be added to an instance by setting the "class" option to its
+# fully qualified class name (see examples below).
+
+## List of available sinks and their properties.
+
+# org.apache.spark.metrics.sink.ConsoleSink
+# Name: Default: Description:
+# period 10 Poll period
+# unit seconds Unit of the poll period
+
+# org.apache.spark.metrics.sink.CSVSink
+# Name: Default: Description:
+# period 10 Poll period
+# unit seconds Unit of the poll period
+# directory /tmp Where to store CSV files
+
+# org.apache.spark.metrics.sink.GangliaSink
+# Name: Default: Description:
+# host NONE Hostname or multicast group of the Ganglia server,
+# must be set
+# port NONE Port of the Ganglia server(s), must be set
+# period 10 Poll period
+# unit seconds Unit of the poll period
+# ttl 1 TTL of messages sent by Ganglia
+# dmax 0 Lifetime in seconds of metrics (0 never expired)
+# mode multicast Ganglia network mode ('unicast' or 'multicast')
+
+# org.apache.spark.metrics.sink.JmxSink
+
+# org.apache.spark.metrics.sink.MetricsServlet
+# Name: Default: Description:
+# path VARIES* Path prefix from the web server root
+# sample false Whether to show entire set of samples for histograms
+# ('false' or 'true')
+#
+# * Default path is /metrics/json for all instances except the master. The
+# master has two paths:
+# /metrics/applications/json # App information
+# /metrics/master/json # Master information
+
+# org.apache.spark.metrics.sink.PrometheusServlet
+# Name: Default: Description:
+# path VARIES* Path prefix from the web server root
+#
+# * Default path is /metrics/prometheus for all instances except the master. The
+# master has two paths:
+# /metrics/applications/prometheus # App information
+# /metrics/master/prometheus # Master information
+
+# org.apache.spark.metrics.sink.GraphiteSink
+# Name: Default: Description:
+# host NONE Hostname of the Graphite server, must be set
+# port NONE Port of the Graphite server, must be set
+# period 10 Poll period
+# unit seconds Unit of the poll period
+# prefix EMPTY STRING Prefix to prepend to every metric's name
+# protocol tcp Protocol ("tcp" or "udp") to use
+# regex NONE Optional filter to send only metrics matching this regex string
+
+# org.apache.spark.metrics.sink.StatsdSink
+# Name: Default: Description:
+# host 127.0.0.1 Hostname or IP of StatsD server
+# port 8125 Port of StatsD server
+# period 10 Poll period
+# unit seconds Units of poll period
+# prefix EMPTY STRING Prefix to prepend to metric name
+
+## Examples
+# Enable JmxSink for all instances by class name
+#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink
+
+# Enable ConsoleSink for all instances by class name
+#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink
+
+# Enable StatsdSink for all instances by class name
+#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink
+#*.sink.statsd.prefix=spark
+
+# Polling period for the ConsoleSink
+#*.sink.console.period=10
+# Unit of the polling period for the ConsoleSink
+#*.sink.console.unit=seconds
+
+# Polling period for the ConsoleSink specific for the master instance
+#master.sink.console.period=15
+# Unit of the polling period for the ConsoleSink specific for the master
+# instance
+#master.sink.console.unit=seconds
+
+# Enable CsvSink for all instances by class name
+#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink
+
+# Polling period for the CsvSink
+#*.sink.csv.period=1
+# Unit of the polling period for the CsvSink
+#*.sink.csv.unit=minutes
+
+# Polling directory for CsvSink
+#*.sink.csv.directory=/tmp/
+
+# Polling period for the CsvSink specific for the worker instance
+#worker.sink.csv.period=10
+# Unit of the polling period for the CsvSink specific for the worker instance
+#worker.sink.csv.unit=minutes
+
+# Enable Slf4jSink for all instances by class name
+#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink
+
+# Polling period for the Slf4JSink
+#*.sink.slf4j.period=1
+# Unit of the polling period for the Slf4jSink
+#*.sink.slf4j.unit=minutes
+
+# Example configuration for Graphite sink
+#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink
+#*.sink.graphite.host=
+#*.sink.graphite.port=
+#*.sink.graphite.period=10
+#*.sink.graphite.unit=seconds
+#*.sink.graphite.prefix=
+
+# Enable JvmSource for instance master, worker, driver and executor
+#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource
+
+#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource
+
+#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
+
+#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource
+
+# Example configuration for PrometheusServlet
+#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
+#*.sink.prometheusServlet.path=/metrics/prometheus
+#master.sink.prometheusServlet.path=/metrics/master/prometheus
+#applications.sink.prometheusServlet.path=/metrics/applications/prometheus
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf
new file mode 100755
index 000000000..844709658
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf
@@ -0,0 +1,26 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+spark.sql.extensions com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension
+spark.sql.catalog.lakesoul org.apache.spark.sql.lakesoul.catalog.LakeSoulCatalog
+# spark.sql.defaultCatalog lakesoul
+# spark.eventLog.enabled true
+# spark.sql.catalogImplementation in-memory
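+
+# Example (assumption, not a committed default): if jobs should reach the MinIO
+# service from docker-compose.yml through S3A, settings along these lines could
+# be added here instead of relying on environment variables:
+# spark.hadoop.fs.s3a.endpoint              http://minio:9000
+# spark.hadoop.fs.s3a.path.style.access     true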
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf.template
new file mode 100644
index 000000000..19cba6e71
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf.template
@@ -0,0 +1,27 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master spark://master:7077
+# spark.eventLog.enabled true
+# spark.eventLog.dir hdfs://namenode:8021/directory
+# spark.serializer org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory 5g
+# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-env.sh.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-env.sh.template
new file mode 100755
index 000000000..e9491995e
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-env.sh.template
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with
+# ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
+
+# Options read by executors and drivers running inside the cluster
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
+# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos
+
+# Options read in any mode
+# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf)
+# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
+
+# Options read in any cluster manager using HDFS
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
+
+# Options read in YARN client/cluster mode
+# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN
+
+# Options for the daemons used in the standalone deploy mode
+# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
+# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
+# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
+# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
+# - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
+# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
+
+# Options for launcher
+# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y")
+
+# Generic options for the daemons used in the standalone deploy mode
+# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
+# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
+# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5.
+# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
+# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
+# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
+# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file.
+# Options for native BLAS, like Intel MKL, OpenBLAS, and so on.
+# You might get better performance to enable these options if using native BLAS (see SPARK-21305).
+# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
+# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
+
+# Options for beeline
+# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y")
+# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G)
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/workers.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/workers.template
new file mode 100644
index 000000000..be42a6382
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/workers.template
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# A Spark Worker will be started on each of the machines listed below.
+localhost
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/docker-compose.env b/docker/lakesoul-ann-benchmark-env/docker-compose.env
new file mode 100644
index 000000000..9485df04a
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/docker-compose.env
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+PG_PORT=29013
+MYSQL_PORT=29014
+MINIO_UI_PORT=29002
+MINIO_API_PORT=29001
+DORIS_QUERY_PORT=29031
+FLINK_JOBMANAGER_PORT=28081
+LAKESOUL_META_HOST=lakesoul-meta-pg
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/docker-compose.yml b/docker/lakesoul-ann-benchmark-env/docker-compose.yml
new file mode 100644
index 000000000..f687d044b
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/docker-compose.yml
@@ -0,0 +1,87 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+version: '3'
+
+services:
+ postgres:
+ image: postgres:14.5
+ container_name: lakesoul-ann-pg
+ hostname: ${LAKESOUL_META_HOST}
+ networks:
+ - lakesoul-ann
+ ports:
+ - "${PG_PORT}:5432"
+ restart: always
+ environment:
+ POSTGRES_PASSWORD: lakesoul_test
+ POSTGRES_USER: lakesoul_test
+ POSTGRES_DB: lakesoul_test
+ command:
+ - "postgres"
+ - "-c"
+ - "max_connections=4096"
+ - "-c"
+ - "default_transaction_isolation=serializable"
+ volumes:
+ - ./sql/meta_init.sql:/docker-entrypoint-initdb.d/meta_init.sql
+ - ./sql/meta_cleanup.sql:/meta_cleanup.sql
+
+ minio:
+ image: bitnami/minio:latest
+ container_name: lakesoul-ann-minio
+ ports:
+ - ${MINIO_API_PORT}:9000
+ - ${MINIO_UI_PORT}:9001
+ environment:
+ MINIO_ROOT_USER: admin
+ MINIO_ROOT_PASSWORD: password
+ hostname: minio
+ networks:
+ - lakesoul-ann
+ command: /opt/bitnami/minio/bin/minio server /data --console-address ":9001"
+
+ spark:
+ image: bitnami/spark:3.3.1
+ container_name: lakesoul-ann-spark
+ privileged: true
+ user: root
+ volumes:
+ - ./packages/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar:/opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar
+ # - ./packages/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT-tests.jar:/opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT-tests.jar
+ - ./data/spark-conf:/opt/bitnami/spark/conf
+ - ./sql/prepare_ann_benchmark.sql:/opt/sql/prepare_ann_benchmark.sql
+ - ./data/embeddings:/data/embeddings
+ - ./python:/opt/bitnami/spark/python
+ depends_on:
+ - postgres
+ - minio
+ environment:
+ - AWS_ACCESS_KEY_ID=admin
+ - AWS_SECRET_ACCESS_KEY=password
+ - AWS_REGION=us-east-1
+ - LAKESOUL_PG_DRIVER=com.lakesoul.shaded.org.postgresql.Driver
+ - LAKESOUL_PG_URL=jdbc:postgresql://${LAKESOUL_META_HOST}:5432/lakesoul_test?stringtype=unspecified
+ - LAKESOUL_PG_USERNAME=lakesoul_test
+ - LAKESOUL_PG_PASSWORD=lakesoul_test
+ networks:
+ - lakesoul-ann
+
+networks:
+ lakesoul-ann:
+ ipam:
+ driver: default
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/init.log b/docker/lakesoul-ann-benchmark-env/init.log
new file mode 100644
index 000000000..5880c4827
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/init.log
@@ -0,0 +1,339 @@
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/05 05:55:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/05 05:56:08 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/05 05:56:08 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/05 05:56:16 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/05 05:56:16 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.19.0.7
+24/11/05 05:56:17 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730786161919
+Time taken: 8.551 seconds
+24/11/05 05:56:27 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 5.61 seconds
+Time taken: 2.213 seconds
+Time taken: 3.564 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/05 06:34:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/05 06:34:38 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/05 06:34:38 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/05 06:34:46 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/05 06:34:46 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.21.0.6
+24/11/05 06:34:46 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730788472466
+Time taken: 5.694 seconds
+24/11/05 06:34:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.316 seconds
+Time taken: 1.887 seconds
+Time taken: 3.603 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/05 06:38:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/05 06:38:53 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/05 06:38:53 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/05 06:39:08 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/05 06:39:08 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.25.0.7
+24/11/05 06:39:08 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730788731559
+Time taken: 6.553 seconds
+24/11/05 06:39:16 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.3 seconds
+Time taken: 5.505 seconds
+Time taken: 4.333 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/05 06:47:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/05 06:47:44 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/05 06:47:44 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/05 06:48:01 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/05 06:48:01 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.27.0.6
+24/11/05 06:48:01 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730789262128
+Time taken: 6.079 seconds
+24/11/05 06:48:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.323 seconds
+Time taken: 2.136 seconds
+Time taken: 3.613 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/06 02:37:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/06 02:37:12 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/06 02:37:12 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/06 02:37:28 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/06 02:37:28 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.28.0.6
+24/11/06 02:37:28 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730860628015
+Time taken: 4.296 seconds
+24/11/06 02:37:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.318 seconds
+Time taken: 2.348 seconds
+Time taken: 3.657 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/06 02:55:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/06 02:55:37 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/06 02:55:37 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/06 02:55:47 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/06 02:55:47 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.29.0.6
+24/11/06 02:55:47 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730861730302
+Time taken: 5.277 seconds
+24/11/06 02:55:53 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.206 seconds
+Time taken: 2.002 seconds
+Time taken: 3.546 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/06 09:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/06 10:00:01 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/06 10:00:01 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/06 10:00:21 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/06 10:00:21 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.6
+24/11/06 10:00:21 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730887194895
+Time taken: 3.746 seconds
+24/11/06 10:00:26 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.08 seconds
+Time taken: 2.016 seconds
+Time taken: 3.352 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/07 06:44:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/07 06:45:03 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/07 06:45:03 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/07 06:45:06 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/07 06:45:06 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.6
+Spark master: local[*], Application Id: local-1730961900954
+Time taken: 3.595 seconds
+Time taken: 0.112 seconds
+Time taken: 2.173 seconds
+24/11/07 06:45:12 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 6.16 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 05:28:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 05:28:18 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:28:18 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 05:28:23 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 05:28:23 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.176.6
+24/11/08 05:28:23 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731043693667
+24/11/08 05:28:29 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 05:28:29 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 05:28:29 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 05:28:29 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:28:29 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 6.226 seconds
+24/11/08 05:28:30 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.068 seconds
+Time taken: 0.998 seconds
+24/11/08 05:28:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.009 seconds
+Time taken: 6.05 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 05:38:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 05:38:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:38:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 05:38:39 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 05:38:39 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.192.6
+24/11/08 05:38:39 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731044301051
+24/11/08 05:38:42 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 05:38:42 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 05:38:42 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 05:38:42 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:38:42 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 3.719 seconds
+24/11/08 05:38:43 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.048 seconds
+Time taken: 1.003 seconds
+24/11/08 05:38:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.875 seconds
+Time taken: 6.24 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 05:47:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 05:47:45 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:47:45 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 05:48:03 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 05:48:03 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.208.7
+24/11/08 05:48:03 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731044862868
+24/11/08 05:48:07 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 05:48:07 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 05:48:08 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 05:48:08 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:48:08 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 4.625 seconds
+24/11/08 05:48:08 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.057 seconds
+Time taken: 1.021 seconds
+24/11/08 05:48:09 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.912 seconds
+Time taken: 6.344 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 05:53:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 05:53:16 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:53:16 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 05:53:36 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 05:53:36 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.224.6
+24/11/08 05:53:36 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731045194239
+24/11/08 05:53:39 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 05:53:39 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 05:53:40 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 05:53:40 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:53:40 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 3.942 seconds
+24/11/08 05:53:40 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.051 seconds
+Time taken: 1.083 seconds
+24/11/08 05:53:41 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.848 seconds
+Time taken: 6.239 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 05:59:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 05:59:13 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:59:13 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 05:59:20 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 05:59:20 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.240.7
+24/11/08 05:59:20 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731045549352
+24/11/08 05:59:26 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 05:59:26 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 05:59:26 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 05:59:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 05:59:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 7.186 seconds
+24/11/08 05:59:28 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.056 seconds
+Time taken: 1.013 seconds
+24/11/08 05:59:29 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.023 seconds
+Time taken: 6.129 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 06:18:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 06:18:51 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 06:18:51 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 06:19:09 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 06:19:09 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.6
+24/11/08 06:19:09 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731046729488
+24/11/08 06:19:12 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 06:19:12 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 06:19:12 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 06:19:12 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 06:19:12 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 3.749 seconds
+24/11/08 06:19:13 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.049 seconds
+Time taken: 1.044 seconds
+24/11/08 06:19:14 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.862 seconds
+Time taken: 6.531 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 06:48:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 06:48:35 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 06:48:35 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 06:48:48 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 06:48:48 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.23.0.6
+24/11/08 06:48:48 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731048511456
+24/11/08 06:48:52 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 06:48:52 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 06:48:52 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 06:48:52 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 06:48:52 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 4.072 seconds
+24/11/08 06:48:53 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.056 seconds
+Time taken: 0.989 seconds
+24/11/08 06:48:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.902 seconds
+Time taken: 6.396 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 07:04:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 07:04:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 07:04:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 07:04:40 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 07:04:40 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.27.0.7
+24/11/08 07:04:40 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731049462352
+24/11/08 07:04:44 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 07:04:44 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 07:04:44 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 07:04:44 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 07:04:44 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 3.962 seconds
+24/11/08 07:04:45 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+24/11/08 07:04:45 WARN ObjectStore: Failed to get database lakesoul, returning NoSuchObjectException
+Error in query: Database 'lakesoul' not found
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/08 07:20:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/08 07:20:40 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 07:20:40 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/08 07:20:49 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/08 07:20:49 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.29.0.6
+24/11/08 07:20:49 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731050436434
+24/11/08 07:20:54 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/08 07:20:55 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/08 07:20:55 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/08 07:20:55 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/08 07:20:55 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 7.189 seconds
+24/11/08 07:20:57 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.048 seconds
+Time taken: 1.01 seconds
+24/11/08 07:20:58 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.993 seconds
+Time taken: 6.337 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/13 10:15:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/13 10:15:36 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/13 10:15:36 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/13 10:15:50 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/13 10:15:50 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.31.0.6
+24/11/13 10:15:50 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731492931846
+24/11/13 10:15:53 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/13 10:15:53 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/13 10:15:54 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/13 10:15:54 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/13 10:15:54 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 3.95 seconds
+24/11/13 10:15:54 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.048 seconds
+Time taken: 1.047 seconds
+24/11/13 10:15:56 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.033 seconds
+Time taken: 5.933 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/15 02:12:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/15 02:12:50 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/15 02:12:50 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/15 02:13:02 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/15 02:13:02 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.16.6
+24/11/15 02:13:02 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1731636766381
+24/11/15 02:13:06 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+24/11/15 02:13:06 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
+24/11/15 02:13:06 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/15 02:13:06 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/15 02:13:06 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 4.108 seconds
+24/11/15 02:13:07 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.053 seconds
+Time taken: 1.004 seconds
+24/11/15 02:13:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.95 seconds
+Time taken: 6.336 seconds
diff --git a/docker/lakesoul-ann-benchmark-env/prepare_ann_table.sh b/docker/lakesoul-ann-benchmark-env/prepare_ann_table.sh
new file mode 100644
index 000000000..74cc6c2c4
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/prepare_ann_table.sh
@@ -0,0 +1,15 @@
+echo "Start prepare data for LSH ANN benchmark..."
+
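+# Run the LocalSensitiveHashANN job in "load" mode: it reads the train/test splits from the HDF5
+# file and writes them into the LakeSoul table `ann_table` on MinIO, tagging the Embedding column
+# with LSH metadata (embedding dimension 784, bit width 512) so that LSH signatures are available
+# for the query stage.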
+docker exec -it lakesoul-ann-spark spark-submit \
+ --class com.dmetasoul.lakesoul.spark.ann.LocalSensitiveHashANN \
+ --conf spark.sql.extensions=com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension \
+ --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+ --conf spark.hadoop.fs.s3a.buffer.dir=/opt/spark/work-dir/s3a \
+ --conf spark.hadoop.fs.s3a.path.style.access=true \
+ --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \
+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
+ --conf spark.hadoop.fs.s3a.access.key=admin \
+ --conf spark.hadoop.fs.s3a.secret.key=password \
+ --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+ /opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar load --hdf5-file /data/embeddings/fashion-mnist-784-euclidean.hdf5 --warehouse s3://lakesoul-test-bucket --table-name ann_table --embedding-dim 784 --bit-width 512
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/run_query.sh b/docker/lakesoul-ann-benchmark-env/run_query.sh
new file mode 100644
index 000000000..5f95e53ec
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/run_query.sh
@@ -0,0 +1,15 @@
+echo "Start query data for LSH ANN benchmark..."
+
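+# Run the LocalSensitiveHashANN job in "query" mode: for each test vector it ranks train vectors by
+# Hamming distance over the LSH signatures, re-ranks the top n-factor * topk candidates by exact
+# Euclidean distance, and reports recall against the ground-truth neighbors stored in the HDF5 file.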
+docker exec -it lakesoul-ann-spark spark-submit \
+ --class com.dmetasoul.lakesoul.spark.ann.LocalSensitiveHashANN \
+ --conf spark.sql.extensions=com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension \
+ --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+ --conf spark.hadoop.fs.s3a.buffer.dir=/opt/spark/work-dir/s3a \
+ --conf spark.hadoop.fs.s3a.path.style.access=true \
+ --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \
+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
+ --conf spark.hadoop.fs.s3a.access.key=admin \
+ --conf spark.hadoop.fs.s3a.secret.key=password \
+ --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
+ --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+ /opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar query --hdf5-file /data/embeddings/fashion-mnist-784-euclidean.hdf5 --table-name ann_table --query-limit 100 --topk 100 --n-factor 3
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/sql/meta_cleanup.sql b/docker/lakesoul-ann-benchmark-env/sql/meta_cleanup.sql
new file mode 100644
index 000000000..dd25a7ee9
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/sql/meta_cleanup.sql
@@ -0,0 +1,11 @@
+-- SPDX-FileCopyrightText: 2023 LakeSoul Contributors
+--
+-- SPDX-License-Identifier: Apache-2.0
+
+delete from namespace;
+insert into namespace(namespace, properties, comment) values ('default', '{}', '');
+delete from data_commit_info;
+delete from table_info;
+delete from table_path_id;
+delete from table_name_id;
+delete from partition_info;
diff --git a/docker/lakesoul-ann-benchmark-env/sql/meta_init.sql b/docker/lakesoul-ann-benchmark-env/sql/meta_init.sql
new file mode 100644
index 000000000..28b4eca4b
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/sql/meta_init.sql
@@ -0,0 +1,145 @@
+-- SPDX-FileCopyrightText: 2023 LakeSoul Contributors
+--
+-- SPDX-License-Identifier: Apache-2.0
+
+create table if not exists namespace
+(
+ namespace text,
+ properties json,
+ comment text,
+ domain text default 'public',
+ primary key (namespace)
+);
+
+insert into namespace(namespace, properties, comment) values ('default', '{}', '')
+ON CONFLICT DO NOTHING;
+
+create table if not exists table_info
+(
+ table_id text,
+ table_namespace text default 'default',
+ table_name text,
+ table_path text,
+ table_schema text,
+ properties json,
+ partitions text,
+ domain text default 'public',
+ primary key (table_id)
+);
+
+create table if not exists table_name_id
+(
+ table_name text,
+ table_id text,
+ table_namespace text default 'default',
+ domain text default 'public',
+ primary key (table_name, table_namespace)
+);
+
+create table if not exists table_path_id
+(
+ table_path text,
+ table_id text,
+ table_namespace text default 'default',
+ domain text default 'public',
+ primary key (table_path)
+);
+
+DO
+$$
+ BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'data_file_op') THEN
+ create type data_file_op as
+ (
+ path text,
+ file_op text,
+ size bigint,
+ file_exist_cols text
+ );
+ END IF;
+ END
+$$;
+
+create table if not exists data_commit_info
+(
+ table_id text,
+ partition_desc text,
+ commit_id UUID,
+ file_ops data_file_op[],
+ commit_op text,
+ committed boolean default 'false',
+ timestamp bigint,
+ domain text default 'public',
+ primary key (table_id, partition_desc, commit_id)
+);
+
+create table if not exists partition_info
+(
+ table_id text,
+ partition_desc text,
+ version int,
+ commit_op text,
+ timestamp bigint DEFAULT (date_part('epoch'::text, now()) * (1000)::double precision),
+ snapshot UUID[],
+ expression text,
+ domain text default 'public',
+ primary key (table_id, partition_desc, version)
+);
+
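+-- Trigger function fired on every partition_info insert: for non-compaction commits it sends a
+-- pg_notify on the 'lakesoul_compaction_notify' channel once a partition has accumulated enough
+-- versions (3 or more past the last CompactionCommit, or version >= 2 if it has never been compacted).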
+CREATE OR REPLACE FUNCTION partition_insert() RETURNS TRIGGER AS
+$$
+DECLARE
+ rs_version integer;
+ rs_table_path text;
+ rs_table_namespace text;
+BEGIN
+ if NEW.commit_op <> 'CompactionCommit' then
+ select version
+ INTO rs_version
+ from partition_info
+ where table_id = NEW.table_id
+ and partition_desc = NEW.partition_desc
+ and version != NEW.version
+ and commit_op = 'CompactionCommit'
+ order by version desc
+ limit 1;
+ if rs_version >= 0 then
+ if NEW.version - rs_version >= 3 then
+ select table_path, table_namespace
+ into rs_table_path, rs_table_namespace
+ from table_info
+ where table_id = NEW.table_id;
+ perform pg_notify('lakesoul_compaction_notify',
+ concat('{"table_path":"', rs_table_path, '","table_partition_desc":"',
+ NEW.partition_desc, '","table_namespace":"', rs_table_namespace, '"}'));
+ end if;
+ else
+ if NEW.version >= 2 then
+ select table_path, table_namespace
+ into rs_table_path, rs_table_namespace
+ from table_info
+ where table_id = NEW.table_id;
+ perform pg_notify('lakesoul_compaction_notify',
+ concat('{"table_path":"', rs_table_path, '","table_partition_desc":"',
+ NEW.partition_desc, '","table_namespace":"', rs_table_namespace, '"}'));
+ end if;
+ end if;
+ RETURN NULL;
+ end if;
+ RETURN NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE TRIGGER partition_table_change
+ AFTER INSERT
+ ON partition_info
+ FOR EACH ROW
+EXECUTE PROCEDURE partition_insert();
+
+create table if not exists global_config
+(
+ key text,
+ value text,
+ primary key (key)
+);
+
diff --git a/docker/lakesoul-ann-benchmark-env/start_all.sh b/docker/lakesoul-ann-benchmark-env/start_all.sh
new file mode 100644
index 000000000..b3dbc197e
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/start_all.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+
+set -e
+
+LAKESOUL_VERSION=2.6.2-SNAPSHOT
+SPARK_LAKESOUL_JAR=lakesoul-spark-3.3-${LAKESOUL_VERSION}.jar
+SPARK_LAKESOUL_TEST_JAR=lakesoul-spark-3.3-${LAKESOUL_VERSION}-tests.jar
+
+download_hdf5_file() {
+ local FILE_NAME=$1
+ local EXPECTED_MD5=$2
+ local DOWNLOAD_URL=$3
+
+ if [[ -f ${FILE_NAME} ]]; then
+ local FILE_MD5
+ echo "compare md5 of ${FILE_NAME} with ${EXPECTED_MD5}"
+ FILE_MD5=$(md5sum ${FILE_NAME} | awk '{print $1}')
+ if [[ "${FILE_MD5}" != "${EXPECTED_MD5}" ]]; then
+ echo "md5 not match, expected: ${EXPECTED_MD5}, actual: ${FILE_MD5}"
+ rm "${FILE_NAME}"
+ wget "${DOWNLOAD_URL}/${FILE_NAME}"
+ fi
+ else
+ echo "download ${FILE_NAME} "
+ wget "${DOWNLOAD_URL}/${FILE_NAME}"
+ fi
+}
+
+mkdir -p data/embeddings
+
+download_hdf5_file "fashion-mnist-784-euclidean.hdf5" "23249362dbd58a321c05b1a85275adba" "http://ann-benchmarks.com"
+cp fashion-mnist-784-euclidean.hdf5 data/embeddings/
+
+# echo "Prepare python environment..."
+# if [ ! -d ".venv" ]; then
+# python -m venv .venv
+# source .venv/bin/activate
+# # Install the required Python packages locally
+# pip install -r requirements.txt
+# else
+# source .venv/bin/activate
+# fi
+
+mkdir -p packages/jars
+
+cp ../../lakesoul-spark/target/${SPARK_LAKESOUL_JAR} packages/jars/
+cp ../../lakesoul-spark/target/${SPARK_LAKESOUL_TEST_JAR} packages/jars/
+
+
+echo "Start docker-compose..."
+docker compose -f docker-compose.yml --env-file docker-compose.env up -d
+
+echo "Waiting for MinIO to be ready..."
+sleep 10 # give MinIO some time to start up
+
+# Create the bucket
+docker exec lakesoul-ann-minio mc alias set local http://localhost:9000 admin password
+docker exec lakesoul-ann-minio mc mb local/lakesoul-test-bucket
+docker exec lakesoul-ann-minio mc policy set public local/lakesoul-test-bucket
+
+
+# First copy the Python scripts and the virtual environment into the container
+# docker cp .venv/lib/python3.11/site-packages lakesoul-ann-spark:/opt/bitnami/python/lib/python3.8
+
+# bash prepare_ann_table.sh
+
+echo "============================================================================="
+echo "Success to launch LSH ANN benchmark environment!"
+echo "You can:"
+echo " 'bash start_spark_sql.sh' to login into spark"
+echo "============================================================================="
+
diff --git a/docker/lakesoul-ann-benchmark-env/start_spark_sql.sh b/docker/lakesoul-ann-benchmark-env/start_spark_sql.sh
new file mode 100644
index 000000000..e7e7e5487
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/start_spark_sql.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+sudo docker exec -it lakesoul-ann-spark spark-sql \
+ --conf spark.sql.extensions=com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension \
+ --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+ --conf spark.hadoop.fs.s3a.buffer.dir=/opt/spark/work-dir/s3a \
+ --conf spark.hadoop.fs.s3a.path.style.access=true \
+ --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \
+ --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
+ --conf spark.hadoop.fs.s3a.access.key=admin \
+ --conf spark.hadoop.fs.s3a.secret.key=password
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/stop_all.sh b/docker/lakesoul-ann-benchmark-env/stop_all.sh
new file mode 100644
index 000000000..63c244756
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/stop_all.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+sudo docker compose -f docker-compose.yml --env-file docker-compose.env down
diff --git a/docker/lakesoul-ann-benchmark-env/work-dir/lakesoul.properties b/docker/lakesoul-ann-benchmark-env/work-dir/lakesoul.properties
new file mode 100644
index 000000000..a37fffa1a
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/work-dir/lakesoul.properties
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+lakesoul.pg.driver=com.lakesoul.shaded.org.postgresql.Driver
+lakesoul.pg.url=jdbc:postgresql://doris-lakesoul-docker-compose-env-lakesoul-meta-db-1:5432/lakesoul_test?stringtype=unspecified
+lakesoul.pg.username=lakesoul_test
+lakesoul.pg.password=lakesoul_test
diff --git a/lakesoul-spark/pom.xml b/lakesoul-spark/pom.xml
index f880dd64d..ecb52d619 100644
--- a/lakesoul-spark/pom.xml
+++ b/lakesoul-spark/pom.xml
@@ -513,6 +513,7 @@ SPDX-License-Identifier: Apache-2.0
                            <include>org.postgresql:postgresql</include>
                            <include>com.alibaba:fastjson</include>
                            <include>mysql:mysql-connector-java</include>
+                           <include>io.jhdf:jhdf</include>
                            <include>org.casbin:jdbc-adapter</include>
diff --git a/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java
index 38ab69faf..c0c27e1fc 100644
--- a/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java
+++ b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java
@@ -364,4 +364,13 @@ protected void addToDefaults(String key, String value) {
}
}
+
+ @Override
+ public String toString() {
+ return "ParametersTool{" +
+ "data=" + data +
+ ", defaultData=" + defaultData +
+ ", unrequestedParameters=" + unrequestedParameters +
+ '}';
+ }
}
\ No newline at end of file
diff --git a/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ann/LocalSensitiveHashANN.scala b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ann/LocalSensitiveHashANN.scala
new file mode 100644
index 000000000..4664f24a9
--- /dev/null
+++ b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ann/LocalSensitiveHashANN.scala
@@ -0,0 +1,328 @@
+package com.dmetasoul.lakesoul.spark.ann
+
+import com.dmetasoul.lakesoul.spark.ParametersTool
+import com.dmetasoul.lakesoul.tables.LakeSoulTable
+import io.jhdf.HdfFile
+import org.apache.spark.sql.functions.{collect_list, udf}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.lakesoul.LakeSoulOptions
+import org.apache.spark.sql.lakesoul.catalog.LakeSoulCatalog
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{Row, SparkSession}
+
+import java.nio.file.Paths
+import scala.concurrent.duration.DurationLong
+import scala.math.{pow, sqrt}
+
+object LocalSensitiveHashANN {
+
+ case class LoadConfig(
+ hdf5File: String = "",
+ warehouse: String = "",
+ tableName: String = "ann_table",
+ embeddingDim: Int = 0,
+ bitWidth: Int = 512,
+ )
+
+ case class QueryConfig(
+ hdf5File: String = "",
+ tableName: String = "",
+ queryLimit: Int = 100,
+ topk: Int = 100,
+ nFactor: Int = 3
+ )
+
+ sealed trait Command
+ case class Load(config: LoadConfig) extends Command
+ case class Query(config: QueryConfig) extends Command
+
+ def parseArgs(args: Array[String]): Option[Command] = {
+ if (args.isEmpty) {
+ printUsage()
+ return None
+ }
+
+ args(0).toLowerCase match {
+ case "load" => parseLoadCommand(args.tail)
+ case "query" => parseQueryCommand(args.tail)
+ case _ =>
+        println(s"Unknown command: ${args(0)}")
+ printUsage()
+ None
+ }
+ }
+
+ private def parseLoadCommand(args: Array[String]): Option[Command] = {
+ var config = LoadConfig()
+ var i = 0
+ while (i < args.length) {
+ args(i) match {
+ case "--hdf5-file" if i + 1 < args.length =>
+ config = config.copy(hdf5File = args(i + 1))
+ i += 2
+ case "--warehouse" if i + 1 < args.length =>
+ config = config.copy(warehouse = args(i + 1))
+ i += 2
+ case "--table-name" if i + 1 < args.length =>
+ config = config.copy(tableName = args(i + 1))
+ i += 2
+ case "--embedding-dim" if i + 1 < args.length =>
+ config = config.copy(embeddingDim = args(i + 1).toInt)
+ i += 2
+ case "--bit-width" if i + 1 < args.length =>
+ config = config.copy(bitWidth = args(i + 1).toInt)
+ i += 2
+ case arg =>
+          println(s"Unknown load argument: $arg")
+ printUsage()
+ return None
+ }
+ }
+
+ if (config.hdf5File.isEmpty || config.warehouse.isEmpty || config.embeddingDim <= 0) {
+      println("Missing required arguments: --hdf5-file, --warehouse and --embedding-dim")
+ printUsage()
+ None
+ } else {
+ Some(Load(config))
+ }
+ }
+
+ private def parseQueryCommand(args: Array[String]): Option[Command] = {
+ var config = QueryConfig()
+ var i = 0
+ while (i < args.length) {
+ args(i) match {
+ case "--hdf5-file" if i + 1 < args.length =>
+ config = config.copy(hdf5File = args(i + 1))
+ i += 2
+ case "--table-name" if i + 1 < args.length =>
+ config = config.copy(tableName = args(i + 1))
+ i += 2
+ case "--query-limit" if i + 1 < args.length =>
+ config = config.copy(queryLimit = args(i + 1).toInt)
+ i += 2
+ case "--topk" if i + 1 < args.length =>
+ config = config.copy(topk = args(i + 1).toInt)
+ i += 2
+ case "--n-factor" if i + 1 < args.length =>
+ config = config.copy(nFactor = args(i + 1).toInt)
+ i += 2
+ case arg =>
+          println(s"Unknown query argument: $arg")
+ printUsage()
+ return None
+ }
+ }
+
+ if (config.tableName.isEmpty) {
+      println("Missing required argument: --table-name")
+ printUsage()
+ None
+ } else {
+ Some(Query(config))
+ }
+ }
+
+ private def printUsage(): Unit = {
+ println(
+      """Usage:
+        | load command:
+        |   load --hdf5-file <path> --warehouse <path> --embedding-dim <dim> [--table-name <name>] [--bit-width <bits>]
+        |
+        | query command:
+        |   query --hdf5-file <path> --table-name <name> [--query-limit <n>] [--topk <k>] [--n-factor <n>]
+        |
+        |Arguments:
+        | load command:
+        |   --hdf5-file    : path to the HDF5 file (required)
+        |   --warehouse    : LakeSoul warehouse path (required)
+        |   --table-name   : table name (optional, default: ann_table)
+        |   --embedding-dim: embedding dimension (required, must match the embedding dimension in the HDF5 file)
+        |   --bit-width    : LSH bit width (optional, default: 512)
+        |
+        | query command:
+        |   --hdf5-file    : path to the HDF5 file (required)
+        |   --table-name   : table name (required)
+        |   --query-limit  : number of test vectors to query (optional, default: 100)
+        |   --topk         : number of nearest neighbors to return (optional, default: 100)
+        |   --n-factor     : LSH first-stage candidate multiplier (optional, default: 3)
+        |""".stripMargin)
+ }
+
+
+ def main(args: Array[String]): Unit = {
+ val builder = SparkSession.builder()
+ .appName("LocalSensitiveHashANN")
+ .master("local[1]")
+ .config("spark.sql.parquet.mergeSchema", value = true)
+ .config("spark.sql.parquet.filterPushdown", value = true)
+ .config("spark.sql.extensions", "com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension")
+ .config("spark.sql.catalog.lakesoul", classOf[LakeSoulCatalog].getName)
+ .config(SQLConf.DEFAULT_CATALOG.key, LakeSoulCatalog.CATALOG_NAME)
+
+ val spark = builder.getOrCreate()
+ spark.sparkContext.setLogLevel("WARN")
+
+ val command = parseArgs(args)
+ command match {
+ case Some(Load(config)) =>
+ val hdfFile = new HdfFile(Paths.get(config.hdf5File))
+
+ val trainDataset = hdfFile.getDatasetByPath("train")
+ val testDataset = hdfFile.getDatasetByPath("test")
+ val trainData = trainDataset.getData()
+ val testData = testDataset.getData()
+
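+        // Benchmark table schema: the Embedding column carries LakeSoul LSH field metadata
+        // (embedding dimension, bit width, RNG seed); Embedding_LSH is left null in the rows below
+        // and is expected to be populated by LakeSoul from that metadata when the data is written.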
+ val schema = StructType(Array(
+ StructField("IndexId", IntegerType, true),
+ StructField("Embedding", ArrayType(FloatType), true, new MetadataBuilder()
+ .putString(LakeSoulOptions.SchemaFieldMetadata.LSH_EMBEDDING_DIMENSION, config.embeddingDim.toString)
+ .putString(LakeSoulOptions.SchemaFieldMetadata.LSH_BIT_WIDTH, config.bitWidth.toString)
+ .putString(LakeSoulOptions.SchemaFieldMetadata.LSH_RNG_SEED, "1234567890").build()),
+ StructField("Embedding_LSH", ArrayType(LongType), true),
+ StructField("Split", StringType, true)
+ ))
+ trainData match {
+ case float2DData: Array[Array[Float]] =>
+ val classIds = (1 to float2DData.length).toArray
+
+ val rows = float2DData.zip(classIds).map {
+ case (embedding, indexId) =>
+ // Row(indexId, embedding)
+ Row(indexId, embedding, null, "train")
+ }
+ val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
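+          // Write the train split as a LakeSoul table hash-partitioned by IndexId (4 buckets),
+          // so the test split can later be merged into the same table with an upsert on that key.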
+ df.write.format("lakesoul")
+ .option("hashPartitions", "IndexId")
+ .option("hashBucketNum", 4)
+ .option(LakeSoulOptions.SHORT_TABLE_NAME, config.tableName)
+ .mode("Overwrite").save(s"${config.warehouse}/${config.tableName}")
+ case _ =>
+ println("Invalid train data")
+ }
+
+
+ testData match {
+ case float2DTestData: Array[Array[Float]] =>
+ val classIdsTest = (1 to float2DTestData.length).toArray
+
+ val rowsTest = float2DTestData.zip(classIdsTest).map {
+ case (embedding, indexId) =>
+ // Row(indexId, embedding)
+ Row(indexId, embedding, null, "test")
+ }
+
+ // val num = 50
+ val dfTest = spark.createDataFrame(spark.sparkContext.parallelize(rowsTest), schema)
+ LakeSoulTable.forPath(s"${config.warehouse}/${config.tableName}").upsert(dfTest)
+
+ case _ =>
+ println("Invalid test data")
+ }
+
+
+ case Some(Query(config)) =>
+ val hdfFile = new HdfFile(Paths.get(config.hdf5File))
+ val neighborDataset = hdfFile.getDatasetByPath("neighbors")
+ val neighborData = neighborDataset.getData()
+ var float2DDataNeighbor: Array[Array[Int]] = null
+ neighborData match {
+ case data: Array[Array[Int]] =>
+ float2DDataNeighbor = data
+ case _ =>
+ println("invalid neighbor data")
+ }
+      // the smaller the Hamming distance, the greater the similarity
+ val calculateHammingDistanceUDF = udf((trainLSH: Seq[Long], testLSH: Seq[Long]) => {
+ require(trainLSH.length == testLSH.length, "The input sequences must have the same length")
+ trainLSH.zip(testLSH).map { case (train, test) =>
+ java.lang.Long.bitCount(train ^ test)
+ }.sum
+ })
+      // the smaller the Euclidean distance, the greater the similarity
+ val calculateEuclideanDistanceUDF = udf((trainEmbedding: Seq[Float], testEmbedding: Seq[Float]) => {
+ require(testEmbedding.length == trainEmbedding.length, "The input sequences must have the same length")
+ sqrt(trainEmbedding.zip(testEmbedding).map { case (train, test) =>
+ pow(train - test, 2)
+ }.sum)
+ })
+      // this returns cosine similarity: the greater the value, the greater the similarity
+ val calculateCosineDistanceUDF = udf((trainEmbedding: Seq[Float], testEmbedding: Seq[Float]) => {
+ require(testEmbedding.length == trainEmbedding.length, "The input sequences must have the same length")
+ trainEmbedding.zip(testEmbedding).map { case (train, test) =>
+ train * test
+ }.sum / (sqrt(trainEmbedding.map { train => train * train }.sum) * sqrt(testEmbedding.map { test => test * test }.sum))
+ })
+      // the smaller the Jaccard distance, the greater the similarity
+ val calculateJaccardDistanceUDF = udf((trainEmbedding: Seq[Float], testEmbedding: Seq[Float]) => {
+ require(testEmbedding.length == trainEmbedding.length, "The input sequences must have the same length")
+ val anb = testEmbedding.intersect(trainEmbedding).distinct
+ val aub = testEmbedding.union(trainEmbedding).distinct
+ val jaccardCoefficient = anb.length.toDouble / aub.length
+ 1 - jaccardCoefficient
+ })
+ spark.udf.register("calculateHammingDistance", calculateHammingDistanceUDF)
+ spark.udf.register("calculateEuclideanDistance", calculateEuclideanDistanceUDF)
+ spark.udf.register("calculateCosineDistance", calculateCosineDistanceUDF)
+ spark.udf.register("calculateJaccardDistance", calculateJaccardDistanceUDF)
+
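+      // Stage 1 (coarse filter): for the first `queryLimit` test vectors, rank every train vector
+      // by Hamming distance between LSH signatures and keep the nFactor * topk closest candidates
+      // per test vector.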
+ val startTime = System.nanoTime()
+ val result = spark.sql(
+ s"""
+ SELECT *
+ FROM (
+ SELECT
+ testData.IndexId AS indexIdTest,
+ trainData.IndexId AS indexIdTrain,
+ testData.Embedding as EmbeddingTest,
+ trainData.Embedding as EmbeddingTrain,
+ ROW_NUMBER() OVER (PARTITION BY testData.IndexId ORDER BY calculateHammingDistance(testData.Embedding_LSH, trainData.Embedding_LSH) asc) AS rank
+ FROM (SELECT * FROM ${config.tableName} WHERE Split = 'test' LIMIT ${config.queryLimit}) testData
+ CROSS JOIN (SELECT * FROM ${config.tableName} WHERE Split = 'train') trainData
+ ) ranked
+ WHERE rank <= ${config.nFactor * config.topk}
+ """)
+ result.createOrReplaceTempView("rank")
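+      // Stage 2 (re-rank): order the surviving candidates by exact Euclidean distance on the raw
+      // embeddings and keep the topk nearest train vectors for each test vector.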
+ val reResult = spark.sql(
+ s"""
+ SELECT *
+ FROM (
+ SELECT
+ rank.indexIdTest,
+          rank.indexIdTrain,
+ ROW_NUMBER() OVER(PARTITION BY rank.indexIdTest ORDER BY calculateEuclideanDistance(rank.EmbeddingTest,rank.EmbeddingTrain) asc) AS reRank
+ FROM rank
+ ) reRanked
+ WHERE reRank <= ${config.topk}
+ """).groupBy("indexIdTest").agg(collect_list("indexIdTrain").alias("indexIdTrainList"))
+
+
+ val endTime = System.nanoTime()
+ val duration = (endTime - startTime).nanos
+      println(s"time for query with n-factor ${config.nFactor}: ${duration.toMillis} milliseconds")
+
+ val startTime2 = System.nanoTime()
+
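+      // Recall: map the retrieved 1-based train IndexIds back to 0-based positions and compare them
+      // with the ground-truth `neighbors` dataset from the HDF5 file, averaging over all queries.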
+ val recallDF = reResult.select("indexIdTest", "indexIdTrainList").rdd.map(row => {
+ val indexIdTest = row.getAs[Int]("indexIdTest")
+ val indexIdTrainList = row.getAs[Seq[Int]]("indexIdTrainList").toArray
+ val updatedList = indexIdTrainList.map(_ - 1)
+ val count = float2DDataNeighbor(indexIdTest - 1).take(config.topk).count(updatedList.contains)
+ val recall = count.toDouble / config.topk
+ (recall, 1)
+ }).reduce((acc1, acc2) => {
+ (acc1._1 + acc2._1, acc1._2 + acc2._2)
+ })
+
+ val avgRecall = recallDF._1 / recallDF._2
+ println(s"recall rate = $avgRecall")
+
+ val endTime2 = System.nanoTime()
+ val duration2 = (endTime2 - startTime2).nanos
+ println(s"time for sort:${duration2.toMillis} milliseconds")
+
+      case None => println("Failed to parse command-line arguments")
+ }
+ }
+}
diff --git a/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala b/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala
index ccfdad905..9f71de83e 100644
--- a/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala
+++ b/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala
@@ -132,4 +132,10 @@ object LakeSoulOptions {
val SNAPSHOT_READ = "snapshot"
val INCREMENTAL_READ = "incremental"
}
+
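+  // Field-level metadata keys attached to an embedding column (see LocalSensitiveHashANN) to record
+  // its dimension, LSH bit width and RNG seed for LSH-based similarity search.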
+ object SchemaFieldMetadata {
+ val LSH_EMBEDDING_DIMENSION = "lsh_embedding_dimension"
+ val LSH_BIT_WIDTH = "lsh_bit_width"
+ val LSH_RNG_SEED = "lsh_rng_seed"
+ }
}
diff --git a/pom.xml b/pom.xml
index c8887eb60..23d4e3f99 100644
--- a/pom.xml
+++ b/pom.xml
@@ -51,7 +51,7 @@ SPDX-License-Identifier: Apache-2.0
- 2.6.0-SNAPSHOT
+ 2.6.2-SNAPSHOT
UTF-8
UTF-8
UTF-8