diff --git a/docker/lakesoul-ann-benchmark-env/README.md b/docker/lakesoul-ann-benchmark-env/README.md
new file mode 100644
index 000000000..8d4783d5c
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/README.md
@@ -0,0 +1,30 @@
+# LakeSoul ANN Benchmark Environment
+
+## How to Run
+
+1. Compile the test classes
+```bash
+bash compile_test_class.sh
+```
+This script compiles the required test class files.
+
+2. Start all required containers
+```bash
+bash start_all.sh
+```
+This script starts the following services:
+- PostgreSQL
+- Spark
+- MinIO
+
+3. Prepare the ANN test table
+```bash
+bash prepare_ann_table.sh
+```
+This script creates and populates the table used for ANN testing.
+
+4. Run the query tests
+```bash
+bash run_query.sh
+```
+This script executes the predefined query test cases.
diff --git a/docker/lakesoul-ann-benchmark-env/compile_test_class.sh b/docker/lakesoul-ann-benchmark-env/compile_test_class.sh
new file mode 100644
index 000000000..42af7d8bf
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/compile_test_class.sh
@@ -0,0 +1,2 @@
+cd ../..
+JAVA_HOME=/usr/lib/jvm/java-1.11.0-openjdk-amd64 mvn package -DskipTests -Prelease-linux-x86-64
diff --git a/docker/lakesoul-ann-benchmark-env/conf/my.conf b/docker/lakesoul-ann-benchmark-env/conf/my.conf
new file mode 100644
index 000000000..8a4f2b1ac
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/conf/my.conf
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+[mysqld]
+pid-file = /var/run/mysqld/mysqld.pid
+socket = /var/run/mysqld/mysqld.sock
+datadir = /var/lib/mysql
+secure-file-priv= /data/tpch/
+# Disabling symbolic-links is recommended to prevent assorted security risks
+symbolic-links=0
+
+# Custom config should go here
+!includedir /etc/mysql/conf.d/
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/conf/properties b/docker/lakesoul-ann-benchmark-env/conf/properties
new file mode 100644
index 000000000..dceb3d5d1
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/conf/properties
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# table_num: number of tables to generate
+table_num=10
+# host: MySQL hostname
+host=localhost
+# user: MySQL account; password: MySQL password; port: MySQL port; db: MySQL database name;
+user=root
+password=root
+port=3306
+db=test_cdc
+timezone=UTC
+# row_num: how many rows to randomly insert into each table
+row_num=500
+# delete_num: how many rows to randomly delete
+delete_num=50
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/fairscheduler.xml.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/fairscheduler.xml.template
new file mode 100644
index 000000000..385b2e772
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/fairscheduler.xml.template
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<allocations>
+  <pool name="production">
+    <schedulingMode>FAIR</schedulingMode>
+    <weight>1</weight>
+    <minShare>2</minShare>
+  </pool>
+  <pool name="test">
+    <schedulingMode>FIFO</schedulingMode>
+    <weight>2</weight>
+    <minShare>3</minShare>
+  </pool>
+</allocations>
diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/log4j2.properties.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/log4j2.properties.template
new file mode 100644
index 000000000..ab96e03ba
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/log4j2.properties.template
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+rootLogger.level = info
+rootLogger.appenderRef.stdout.ref = console
+
+# In the pattern layout configuration below, we specify an explicit `%ex` conversion
+# pattern for logging Throwables. If this was omitted, then (by default) Log4J would
+# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional
+# class packaging information. That extra information can sometimes add a substantial
+# performance overhead, so we disable it in our default logging config.
+# For more information, see SPARK-39361.
+appender.console.type = Console
+appender.console.name = console
+appender.console.target = SYSTEM_ERR
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex
+
+# Set the default spark-shell/spark-sql log level to WARN. When running the
+# spark-shell/spark-sql, the log level for these classes is used to overwrite
+# the root logger's log level, so that the user can have different defaults
+# for the shell and regular Spark apps.
+logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/metrics.properties.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/metrics.properties.template new file mode 100644 index 000000000..f52d33fd6 --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/metrics.properties.template @@ -0,0 +1,210 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. +# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. 
Common sources, like JvmSource, which will collect low level state. +# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. +# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. +# 6. The metrics system can also be configured using Spark configuration +# parameters. The relevant parameter names are formed by adding the +# prefix "spark.metrics.conf." to the configuration entries detailed in +# this file (see examples below). + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. + +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. 
The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use +# regex NONE Optional filter to send only metrics matching this regex string + +# org.apache.spark.metrics.sink.StatsdSink +# Name: Default: Description: +# host 127.0.0.1 Hostname or IP of StatsD server +# port 8125 Port of StatsD server +# period 10 Poll period +# unit seconds Units of poll period +# prefix EMPTY STRING Prefix to prepend to metric name + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Enable StatsdSink for all instances by class name +#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink +#*.sink.statsd.prefix=spark + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance +#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Example configuration for Graphite sink +#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink +#*.sink.graphite.host= +#*.sink.graphite.port= +#*.sink.graphite.period=10 +#*.sink.graphite.unit=seconds +#*.sink.graphite.prefix= + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus 
+#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf new file mode 100755 index 000000000..844709658 --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +spark.sql.extensions com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension +spark.sql.catalog.lakesoul org.apache.spark.sql.lakesoul.catalog.LakeSoulCatalog +# spark.sql.defaultCatalog lakesoul +# spark.eventLog.enabled true +# spark.sql.catalogImplementation in-memory diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf.template new file mode 100644 index 000000000..19cba6e71 --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-defaults.conf.template @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-env.sh.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-env.sh.template new file mode 100755 index 000000000..e9491995e --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/spark-env.sh.template @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. + +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in any mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options read in any cluster manager using HDFS +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files + +# Options read in YARN client/cluster mode +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 
1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS + +# Options for beeline +# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y") +# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G) diff --git a/docker/lakesoul-ann-benchmark-env/data/spark-conf/workers.template b/docker/lakesoul-ann-benchmark-env/data/spark-conf/workers.template new file mode 100644 index 000000000..be42a6382 --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/data/spark-conf/workers.template @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. 
+localhost \ No newline at end of file diff --git a/docker/lakesoul-ann-benchmark-env/docker-compose.env b/docker/lakesoul-ann-benchmark-env/docker-compose.env new file mode 100644 index 000000000..9485df04a --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/docker-compose.env @@ -0,0 +1,25 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +PG_PORT=29013 +MYSQL_PORT=29014 +MINIO_UI_PORT=29002 +MINIO_API_PORT=29001 +DORIS_QUERY_PORT=29031 +FLINK_JOBMANAGER_PORT=28081 +LAKESOUL_META_HOST=lakesoul-meta-pg \ No newline at end of file diff --git a/docker/lakesoul-ann-benchmark-env/docker-compose.yml b/docker/lakesoul-ann-benchmark-env/docker-compose.yml new file mode 100644 index 000000000..f687d044b --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/docker-compose.yml @@ -0,0 +1,87 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +version: '3' + +services: + postgres: + image: postgres:14.5 + container_name: lakesoul-ann-pg + hostname: ${LAKESOUL_META_HOST} + networks: + - lakesoul-ann + ports: + - "${PG_PORT}:5432" + restart: always + environment: + POSTGRES_PASSWORD: lakesoul_test + POSTGRES_USER: lakesoul_test + POSTGRES_DB: lakesoul_test + command: + - "postgres" + - "-c" + - "max_connections=4096" + - "-c" + - "default_transaction_isolation=serializable" + volumes: + - ./sql/meta_init.sql:/docker-entrypoint-initdb.d/meta_init.sql + - ./sql/meta_cleanup.sql:/meta_cleanup.sql + + minio: + image: bitnami/minio:latest + container_name: lakesoul-ann-minio + ports: + - ${MINIO_API_PORT}:9000 + - ${MINIO_UI_PORT}:9001 + environment: + MINIO_ROOT_USER: admin + MINIO_ROOT_PASSWORD: password + hostname: minio + networks: + - lakesoul-ann + command: /opt/bitnami/minio/bin/minio server /data --console-address ":9001" + + spark: + image: bitnami/spark:3.3.1 + container_name: lakesoul-ann-spark + privileged: true + user: root + volumes: + - ./packages/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar:/opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar + # - ./packages/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT-tests.jar:/opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT-tests.jar + - ./data/spark-conf:/opt/bitnami/spark/conf + - ./sql/prepare_ann_benchmark.sql:/opt/sql/prepare_ann_benchmark.sql + - ./data/embeddings:/data/embeddings + - ./python:/opt/bitnami/spark/python + depends_on: + - postgres + - minio + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - LAKESOUL_PG_DRIVER=com.lakesoul.shaded.org.postgresql.Driver + - LAKESOUL_PG_URL=jdbc:postgresql://${LAKESOUL_META_HOST}:5432/lakesoul_test?stringtype=unspecified + - LAKESOUL_PG_USERNAME=lakesoul_test + - LAKESOUL_PG_PASSWORD=lakesoul_test + networks: + - lakesoul-ann + +networks: + lakesoul-ann: + ipam: + driver: default \ No newline at end of file diff --git a/docker/lakesoul-ann-benchmark-env/init.log b/docker/lakesoul-ann-benchmark-env/init.log new file mode 100644 index 000000000..5880c4827 --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/init.log @@ -0,0 +1,339 @@ +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/05 05:55:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/05 05:56:08 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/05 05:56:08 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/05 05:56:16 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/05 05:56:16 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.19.0.7 +24/11/05 05:56:17 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1730786161919 +Time taken: 8.551 seconds +24/11/05 05:56:27 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 5.61 seconds +Time taken: 2.213 seconds +Time taken: 3.564 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 
+24/11/05 06:34:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/05 06:34:38 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/05 06:34:38 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/05 06:34:46 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/05 06:34:46 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.21.0.6 +24/11/05 06:34:46 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1730788472466 +Time taken: 5.694 seconds +24/11/05 06:34:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.316 seconds +Time taken: 1.887 seconds +Time taken: 3.603 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/05 06:38:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/05 06:38:53 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/05 06:38:53 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/05 06:39:08 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/05 06:39:08 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.25.0.7 +24/11/05 06:39:08 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1730788731559 +Time taken: 6.553 seconds +24/11/05 06:39:16 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.3 seconds +Time taken: 5.505 seconds +Time taken: 4.333 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/05 06:47:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/05 06:47:44 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/05 06:47:44 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/05 06:48:01 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
+24/11/05 06:48:01 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.27.0.6
+24/11/05 06:48:01 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
+Spark master: local[*], Application Id: local-1730789262128
+Time taken: 6.079 seconds
+24/11/05 06:48:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 2.323 seconds
+Time taken: 2.136 seconds
+Time taken: 3.613 seconds
+Setting default log level to "WARN".
+To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
+24/11/06 02:37:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+24/11/06 02:37:12 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/06 02:37:12 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+24/11/06 02:37:28 WARN ObjectStore: Version information not found in metastore.
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/06 02:37:28 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.28.0.6 +24/11/06 02:37:28 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1730860628015 +Time taken: 4.296 seconds +24/11/06 02:37:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.318 seconds +Time taken: 2.348 seconds +Time taken: 3.657 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/06 02:55:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/06 02:55:37 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/06 02:55:37 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/06 02:55:47 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/06 02:55:47 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.29.0.6 +24/11/06 02:55:47 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1730861730302 +Time taken: 5.277 seconds +24/11/06 02:55:53 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.206 seconds +Time taken: 2.002 seconds +Time taken: 3.546 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/06 09:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/06 10:00:01 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/06 10:00:01 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/06 10:00:21 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/06 10:00:21 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.6 +24/11/06 10:00:21 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1730887194895 +Time taken: 3.746 seconds +24/11/06 10:00:26 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.08 seconds +Time taken: 2.016 seconds +Time taken: 3.352 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/07 06:44:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable +24/11/07 06:45:03 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/07 06:45:03 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/07 06:45:06 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/07 06:45:06 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.6 +Spark master: local[*], Application Id: local-1730961900954 +Time taken: 3.595 seconds +Time taken: 0.112 seconds +Time taken: 2.173 seconds +24/11/07 06:45:12 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 6.16 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 05:28:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 05:28:18 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:28:18 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 05:28:23 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 05:28:23 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.176.6 +24/11/08 05:28:23 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731043693667 +24/11/08 05:28:29 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 05:28:29 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 05:28:29 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 05:28:29 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:28:29 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 6.226 seconds +24/11/08 05:28:30 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.068 seconds +Time taken: 0.998 seconds +24/11/08 05:28:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.009 seconds +Time taken: 6.05 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 05:38:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 05:38:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:38:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 05:38:39 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 05:38:39 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.192.6 +24/11/08 05:38:39 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731044301051 +24/11/08 05:38:42 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 05:38:42 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 05:38:42 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 05:38:42 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:38:42 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 3.719 seconds +24/11/08 05:38:43 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.048 seconds +Time taken: 1.003 seconds +24/11/08 05:38:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 1.875 seconds +Time taken: 6.24 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 05:47:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 05:47:45 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:47:45 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 05:48:03 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 05:48:03 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.208.7 +24/11/08 05:48:03 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731044862868 +24/11/08 05:48:07 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 05:48:07 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. 
+24/11/08 05:48:08 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 05:48:08 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:48:08 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 4.625 seconds +24/11/08 05:48:08 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.057 seconds +Time taken: 1.021 seconds +24/11/08 05:48:09 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 1.912 seconds +Time taken: 6.344 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 05:53:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 05:53:16 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:53:16 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 05:53:36 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 05:53:36 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.224.6 +24/11/08 05:53:36 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731045194239 +24/11/08 05:53:39 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 05:53:39 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 05:53:40 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 05:53:40 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:53:40 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 3.942 seconds +24/11/08 05:53:40 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.051 seconds +Time taken: 1.083 seconds +24/11/08 05:53:41 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 1.848 seconds +Time taken: 6.239 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 05:59:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 05:59:13 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:59:13 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 05:59:20 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 05:59:20 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.240.7 +24/11/08 05:59:20 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731045549352 +24/11/08 05:59:26 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 05:59:26 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 05:59:26 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 05:59:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 05:59:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 7.186 seconds +24/11/08 05:59:28 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.056 seconds +Time taken: 1.013 seconds +24/11/08 05:59:29 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.023 seconds +Time taken: 6.129 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 06:18:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 06:18:51 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 06:18:51 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 06:19:09 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 06:19:09 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.6 +24/11/08 06:19:09 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731046729488 +24/11/08 06:19:12 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 06:19:12 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. 
+24/11/08 06:19:12 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 06:19:12 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 06:19:12 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 3.749 seconds +24/11/08 06:19:13 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.049 seconds +Time taken: 1.044 seconds +24/11/08 06:19:14 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 1.862 seconds +Time taken: 6.531 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 06:48:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 06:48:35 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 06:48:35 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 06:48:48 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 06:48:48 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.23.0.6 +24/11/08 06:48:48 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731048511456 +24/11/08 06:48:52 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 06:48:52 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 06:48:52 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 06:48:52 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 06:48:52 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 4.072 seconds +24/11/08 06:48:53 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.056 seconds +Time taken: 0.989 seconds +24/11/08 06:48:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 1.902 seconds +Time taken: 6.396 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 07:04:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 07:04:26 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 07:04:26 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 07:04:40 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 07:04:40 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.27.0.7 +24/11/08 07:04:40 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731049462352 +24/11/08 07:04:44 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 07:04:44 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 07:04:44 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 07:04:44 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 07:04:44 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 3.962 seconds +24/11/08 07:04:45 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +24/11/08 07:04:45 WARN ObjectStore: Failed to get database lakesoul, returning NoSuchObjectException +Error in query: Database 'lakesoul' not found +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/08 07:20:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/08 07:20:40 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 07:20:40 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/08 07:20:49 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/08 07:20:49 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.29.0.6 +24/11/08 07:20:49 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731050436434 +24/11/08 07:20:54 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/08 07:20:55 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/08 07:20:55 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/08 07:20:55 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/08 07:20:55 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 7.189 seconds +24/11/08 07:20:57 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.048 seconds +Time taken: 1.01 seconds +24/11/08 07:20:58 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 1.993 seconds +Time taken: 6.337 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel). +24/11/13 10:15:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/13 10:15:36 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/13 10:15:36 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/13 10:15:50 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/13 10:15:50 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.31.0.6 +24/11/13 10:15:50 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731492931846 +24/11/13 10:15:53 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/13 10:15:53 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. +24/11/13 10:15:54 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist +24/11/13 10:15:54 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/13 10:15:54 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +Time taken: 3.95 seconds +24/11/13 10:15:54 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException +Time taken: 0.048 seconds +Time taken: 1.047 seconds +24/11/13 10:15:56 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +Time taken: 2.033 seconds +Time taken: 5.933 seconds +Setting default log level to "WARN". +To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). +24/11/15 02:12:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +24/11/15 02:12:50 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist +24/11/15 02:12:50 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist +24/11/15 02:13:02 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0 +24/11/15 02:13:02 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@192.168.16.6 +24/11/15 02:13:02 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException +Spark master: local[*], Application Id: local-1731636766381 +24/11/15 02:13:06 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `default`.`customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. +24/11/15 02:13:06 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory. 
+24/11/15 02:13:06 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
+24/11/15 02:13:06 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
+24/11/15 02:13:06 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
+Time taken: 4.108 seconds
+24/11/15 02:13:07 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+Time taken: 0.053 seconds
+Time taken: 1.004 seconds
+24/11/15 02:13:08 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
+Time taken: 1.95 seconds
+Time taken: 6.336 seconds
diff --git a/docker/lakesoul-ann-benchmark-env/prepare_ann_table.sh b/docker/lakesoul-ann-benchmark-env/prepare_ann_table.sh
new file mode 100644
index 000000000..74cc6c2c4
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/prepare_ann_table.sh
@@ -0,0 +1,15 @@
+echo "Start preparing data for LSH ANN benchmark..."
+
+docker exec -it lakesoul-ann-spark spark-submit \
+    --class com.dmetasoul.lakesoul.spark.ann.LocalSensitiveHashANN \
+    --conf spark.sql.extensions=com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension \
+    --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+    --conf spark.hadoop.fs.s3a.buffer.dir=/opt/spark/work-dir/s3a \
+    --conf spark.hadoop.fs.s3a.path.style.access=true \
+    --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \
+    --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
+    --conf spark.hadoop.fs.s3a.access.key=admin \
+    --conf spark.hadoop.fs.s3a.secret.key=password \
+    --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
+    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+    /opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar load --hdf5-file /data/embeddings/fashion-mnist-784-euclidean.hdf5 --warehouse s3://lakesoul-test-bucket --table-name ann_table --embedding-dim 784 --bit-width 512
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/run_query.sh b/docker/lakesoul-ann-benchmark-env/run_query.sh
new file mode 100644
index 000000000..5f95e53ec
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/run_query.sh
@@ -0,0 +1,15 @@
+echo "Start querying data for LSH ANN benchmark..."
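+# Runs the `query` command of LocalSensitiveHashANN: it reads --query-limit test vectors,
+# pre-filters n-factor * topk candidates by Hamming distance on the LSH signatures,
+# re-ranks them by exact Euclidean distance, and reports recall against the
+# ground-truth neighbors stored in the HDF5 file.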
+ +docker exec -it lakesoul-ann-spark spark-submit \ + --class com.dmetasoul.lakesoul.spark.ann.LocalSensitiveHashANN \ + --conf spark.sql.extensions=com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension \ + --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ + --conf spark.hadoop.fs.s3a.buffer.dir=/opt/spark/work-dir/s3a \ + --conf spark.hadoop.fs.s3a.path.style.access=true \ + --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \ + --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \ + --conf spark.hadoop.fs.s3a.access.key=admin \ + --conf spark.hadoop.fs.s3a.secret.key=password \ + --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \ + --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ + /opt/bitnami/spark/jars/lakesoul-spark-3.3-2.6.2-SNAPSHOT.jar query --hdf5-file /data/embeddings/fashion-mnist-784-euclidean.hdf5 --table-name ann_table --query-limit 100 --topk 100 --n-factor 3 \ No newline at end of file diff --git a/docker/lakesoul-ann-benchmark-env/sql/meta_cleanup.sql b/docker/lakesoul-ann-benchmark-env/sql/meta_cleanup.sql new file mode 100644 index 000000000..dd25a7ee9 --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/sql/meta_cleanup.sql @@ -0,0 +1,11 @@ +-- SPDX-FileCopyrightText: 2023 LakeSoul Contributors +-- +-- SPDX-License-Identifier: Apache-2.0 + +delete from namespace; +insert into namespace(namespace, properties, comment) values ('default', '{}', ''); +delete from data_commit_info; +delete from table_info; +delete from table_path_id; +delete from table_name_id; +delete from partition_info; diff --git a/docker/lakesoul-ann-benchmark-env/sql/meta_init.sql b/docker/lakesoul-ann-benchmark-env/sql/meta_init.sql new file mode 100644 index 000000000..28b4eca4b --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/sql/meta_init.sql @@ -0,0 +1,145 @@ +-- SPDX-FileCopyrightText: 2023 LakeSoul Contributors +-- +-- SPDX-License-Identifier: Apache-2.0 + +create table if not exists namespace +( + namespace text, + properties json, + comment text, + domain text default 'public', + primary key (namespace) +); + +insert into namespace(namespace, properties, comment) values ('default', '{}', '') +ON CONFLICT DO NOTHING; + +create table if not exists table_info +( + table_id text, + table_namespace text default 'default', + table_name text, + table_path text, + table_schema text, + properties json, + partitions text, + domain text default 'public', + primary key (table_id) +); + +create table if not exists table_name_id +( + table_name text, + table_id text, + table_namespace text default 'default', + domain text default 'public', + primary key (table_name, table_namespace) +); + +create table if not exists table_path_id +( + table_path text, + table_id text, + table_namespace text default 'default', + domain text default 'public', + primary key (table_path) +); + +DO +$$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'data_file_op') THEN + create type data_file_op as + ( + path text, + file_op text, + size bigint, + file_exist_cols text + ); + END IF; + END +$$; + +create table if not exists data_commit_info +( + table_id text, + partition_desc text, + commit_id UUID, + file_ops data_file_op[], + commit_op text, + committed boolean default 'false', + timestamp bigint, + domain text default 'public', + primary key (table_id, partition_desc, commit_id) +); + +create table if not exists partition_info +( + table_id text, + partition_desc text, + version int, + 
commit_op text, + timestamp bigint DEFAULT (date_part('epoch'::text, now()) * (1000)::double precision), + snapshot UUID[], + expression text, + domain text default 'public', + primary key (table_id, partition_desc, version) +); + +CREATE OR REPLACE FUNCTION partition_insert() RETURNS TRIGGER AS +$$ +DECLARE + rs_version integer; + rs_table_path text; + rs_table_namespace text; +BEGIN + if NEW.commit_op <> 'CompactionCommit' then + select version + INTO rs_version + from partition_info + where table_id = NEW.table_id + and partition_desc = NEW.partition_desc + and version != NEW.version + and commit_op = 'CompactionCommit' + order by version desc + limit 1; + if rs_version >= 0 then + if NEW.version - rs_version >= 3 then + select table_path, table_namespace + into rs_table_path, rs_table_namespace + from table_info + where table_id = NEW.table_id; + perform pg_notify('lakesoul_compaction_notify', + concat('{"table_path":"', rs_table_path, '","table_partition_desc":"', + NEW.partition_desc, '","table_namespace":"', rs_table_namespace, '"}')); + end if; + else + if NEW.version >= 2 then + select table_path, table_namespace + into rs_table_path, rs_table_namespace + from table_info + where table_id = NEW.table_id; + perform pg_notify('lakesoul_compaction_notify', + concat('{"table_path":"', rs_table_path, '","table_partition_desc":"', + NEW.partition_desc, '","table_namespace":"', rs_table_namespace, '"}')); + end if; + end if; + RETURN NULL; + end if; + RETURN NULL; +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE TRIGGER partition_table_change + AFTER INSERT + ON partition_info + FOR EACH ROW +EXECUTE PROCEDURE partition_insert(); + +create table if not exists global_config +( + key text, + value text, + primary key (key) +); + diff --git a/docker/lakesoul-ann-benchmark-env/start_all.sh b/docker/lakesoul-ann-benchmark-env/start_all.sh new file mode 100644 index 000000000..b3dbc197e --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/start_all.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash + +set -e + +LAKESOUL_VERSION=2.6.2-SNAPSHOT +SPARK_LAKESOUL_JAR=lakesoul-spark-3.3-${LAKESOUL_VERSION}.jar +SPARK_LAKESOUL_TEST_JAR=lakesoul-spark-3.3-${LAKESOUL_VERSION}-tests.jar + +download_hdf5_file() { + local FILE_NAME=$1 + local EXPECTED_MD5=$2 + local DOWNLOAD_URL=$3 + + if [[ -f ${FILE_NAME} ]]; then + local FILE_MD5 + echo "compare md5 of ${FILE_NAME} with ${EXPECTED_MD5}" + FILE_MD5=$(md5sum ${FILE_NAME} | awk '{print $1}') + if [[ "${FILE_MD5}" != "${EXPECTED_MD5}" ]]; then + echo "md5 not match, expected: ${EXPECTED_MD5}, actual: ${FILE_MD5}" + rm "${FILE_NAME}" + wget "${DOWNLOAD_URL}/${FILE_NAME}" + fi + else + echo "download ${FILE_NAME} " + wget "${DOWNLOAD_URL}/${FILE_NAME}" + fi +} + +mkdir -p data/embeddings + +download_hdf5_file "fashion-mnist-784-euclidean.hdf5" "23249362dbd58a321c05b1a85275adba" "http://ann-benchmarks.com" +cp fashion-mnist-784-euclidean.hdf5 data/embeddings/ + +# echo "Prepare python environment..." +# if [ ! -d ".venv" ]; then +# python -m venv .venv +# source .venv/bin/activate +# # 在本地安装必要的Python包 +# pip install -r requirements.txt +# else +# source .venv/bin/activate +# fi + +mkdir -p packages/jars + +cp ../../lakesoul-spark/target/${SPARK_LAKESOUL_JAR} packages/jars/ +cp ../../lakesoul-spark/target/${SPARK_LAKESOUL_TEST_JAR} packages/jars/ + + +echo "Start docker-compose..." +docker compose -f docker-compose.yml --env-file docker-compose.env up -d + +echo "Waiting for MinIO to be ready..." 
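+# The fixed sleep below is a simple heuristic. A more robust wait (an assumption,
+# not part of the original script: it relies on MinIO's /minio/health/live endpoint
+# and on port 9000 being published to the host) could be:
+#   until curl -sf http://localhost:9000/minio/health/live; do sleep 1; done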
+sleep 10 # give MinIO some time to start up
+
+# Create the test bucket
+docker exec lakesoul-ann-minio mc alias set local http://localhost:9000 admin password
+docker exec lakesoul-ann-minio mc mb local/lakesoul-test-bucket
+docker exec lakesoul-ann-minio mc policy set public local/lakesoul-test-bucket
+
+
+# First copy the Python scripts and the virtual environment into the container
+# docker cp .venv/lib/python3.11/site-packages lakesoul-ann-spark:/opt/bitnami/python/lib/python3.8
+
+# bash prepare_ann_table.sh
+
+echo "============================================================================="
+echo "Successfully launched the LSH ANN benchmark environment!"
+echo "You can:"
+echo "    'bash start_spark_sql.sh' to log in to the Spark SQL shell"
+echo "============================================================================="
+
diff --git a/docker/lakesoul-ann-benchmark-env/start_spark_sql.sh b/docker/lakesoul-ann-benchmark-env/start_spark_sql.sh
new file mode 100644
index 000000000..e7e7e5487
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/start_spark_sql.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+sudo docker exec -it lakesoul-ann-spark spark-sql \
+    --conf spark.sql.extensions=com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension \
+    --conf spark.hadoop.fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
+    --conf spark.hadoop.fs.s3a.buffer.dir=/opt/spark/work-dir/s3a \
+    --conf spark.hadoop.fs.s3a.path.style.access=true \
+    --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \
+    --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider \
+    --conf spark.hadoop.fs.s3a.access.key=admin \
+    --conf spark.hadoop.fs.s3a.secret.key=password
\ No newline at end of file
diff --git a/docker/lakesoul-ann-benchmark-env/stop_all.sh b/docker/lakesoul-ann-benchmark-env/stop_all.sh
new file mode 100644
index 000000000..63c244756
--- /dev/null
+++ b/docker/lakesoul-ann-benchmark-env/stop_all.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
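+# Stop and remove the benchmark containers (PostgreSQL, Spark, MinIO) brought up by start_all.sh.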
+ +sudo docker compose -f docker-compose.yml --env-file docker-compose.env down diff --git a/docker/lakesoul-ann-benchmark-env/work-dir/lakesoul.properties b/docker/lakesoul-ann-benchmark-env/work-dir/lakesoul.properties new file mode 100644 index 000000000..a37fffa1a --- /dev/null +++ b/docker/lakesoul-ann-benchmark-env/work-dir/lakesoul.properties @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +lakesoul.pg.driver=com.lakesoul.shaded.org.postgresql.Driver +lakesoul.pg.url=jdbc:postgresql://doris-lakesoul-docker-compose-env-lakesoul-meta-db-1:5432/lakesoul_test?stringtype=unspecified +lakesoul.pg.username=lakesoul_test +lakesoul.pg.password=lakesoul_test diff --git a/lakesoul-spark/pom.xml b/lakesoul-spark/pom.xml index f880dd64d..ecb52d619 100644 --- a/lakesoul-spark/pom.xml +++ b/lakesoul-spark/pom.xml @@ -513,6 +513,7 @@ SPDX-License-Identifier: Apache-2.0 org.postgresql:postgresql com.alibaba:fastjson mysql:mysql-connector-java + io.jhdf:jhdf org.casbin:jdbc-adapter diff --git a/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java index 38ab69faf..c0c27e1fc 100644 --- a/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java +++ b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ParametersTool.java @@ -364,4 +364,13 @@ protected void addToDefaults(String key, String value) { } } + + @Override + public String toString() { + return "ParametersTool{" + + "data=" + data + + ", defaultData=" + defaultData + + ", unrequestedParameters=" + unrequestedParameters + + '}'; + } } \ No newline at end of file diff --git a/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ann/LocalSensitiveHashANN.scala b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ann/LocalSensitiveHashANN.scala new file mode 100644 index 000000000..4664f24a9 --- /dev/null +++ b/lakesoul-spark/src/main/scala/com/dmetasoul/lakesoul/spark/ann/LocalSensitiveHashANN.scala @@ -0,0 +1,328 @@ +package com.dmetasoul.lakesoul.spark.ann + +import com.dmetasoul.lakesoul.spark.ParametersTool +import com.dmetasoul.lakesoul.tables.LakeSoulTable +import io.jhdf.HdfFile +import org.apache.spark.sql.functions.{collect_list, udf} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.lakesoul.LakeSoulOptions +import org.apache.spark.sql.lakesoul.catalog.LakeSoulCatalog +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Row, SparkSession} + +import java.nio.file.Paths +import scala.concurrent.duration.DurationLong +import scala.math.{pow, sqrt} + +object LocalSensitiveHashANN { + + case class LoadConfig( + hdf5File: String = "", + warehouse: String = "", + tableName: String = "ann_table", + 
embeddingDim: Int = 0, + bitWidth: Int = 512, + ) + + case class QueryConfig( + hdf5File: String = "", + tableName: String = "", + queryLimit: Int = 100, + topk: Int = 100, + nFactor: Int = 3 + ) + + sealed trait Command + case class Load(config: LoadConfig) extends Command + case class Query(config: QueryConfig) extends Command + + def parseArgs(args: Array[String]): Option[Command] = { + if (args.isEmpty) { + printUsage() + return None + } + + args(0).toLowerCase match { + case "load" => parseLoadCommand(args.tail) + case "query" => parseQueryCommand(args.tail) + case _ => + println(s"未知命令: ${args(0)}") + printUsage() + None + } + } + + private def parseLoadCommand(args: Array[String]): Option[Command] = { + var config = LoadConfig() + var i = 0 + while (i < args.length) { + args(i) match { + case "--hdf5-file" if i + 1 < args.length => + config = config.copy(hdf5File = args(i + 1)) + i += 2 + case "--warehouse" if i + 1 < args.length => + config = config.copy(warehouse = args(i + 1)) + i += 2 + case "--table-name" if i + 1 < args.length => + config = config.copy(tableName = args(i + 1)) + i += 2 + case "--embedding-dim" if i + 1 < args.length => + config = config.copy(embeddingDim = args(i + 1).toInt) + i += 2 + case "--bit-width" if i + 1 < args.length => + config = config.copy(bitWidth = args(i + 1).toInt) + i += 2 + case arg => + println(s"未知的load参数: $arg") + printUsage() + return None + } + } + + if (config.hdf5File.isEmpty || config.warehouse.isEmpty || config.embeddingDim <= 0) { + println("缺少必需参数: --hdf5-file 和 --warehouse 和 --embedding-dim") + printUsage() + None + } else { + Some(Load(config)) + } + } + + private def parseQueryCommand(args: Array[String]): Option[Command] = { + var config = QueryConfig() + var i = 0 + while (i < args.length) { + args(i) match { + case "--hdf5-file" if i + 1 < args.length => + config = config.copy(hdf5File = args(i + 1)) + i += 2 + case "--table-name" if i + 1 < args.length => + config = config.copy(tableName = args(i + 1)) + i += 2 + case "--query-limit" if i + 1 < args.length => + config = config.copy(queryLimit = args(i + 1).toInt) + i += 2 + case "--topk" if i + 1 < args.length => + config = config.copy(topk = args(i + 1).toInt) + i += 2 + case "--n-factor" if i + 1 < args.length => + config = config.copy(nFactor = args(i + 1).toInt) + i += 2 + case arg => + println(s"未知的query参数: $arg") + printUsage() + return None + } + } + + if (config.tableName.isEmpty) { + println("缺少必需参数: --table-name") + printUsage() + None + } else { + Some(Query(config)) + } + } + + private def printUsage(): Unit = { + println( + """用法: + | load命令: + | load --hdf5-file --warehouse [--table-name ] + | + | query命令: + | query --hdf5-file --table-name [--query-limit ] [--topk ] [--n-factor ] + | + |参数说明: + | load命令参数: + | --hdf5-file : HDF5文件路径(必需) + | --warehouse : LakeSoul仓库路径(必需) + | --table-name : 表名(可选,默认:ann_table) + | --embedding-dim: 嵌入维度(必需,要求与HDF5文件中的embedding维度一致) + | --bit-width : 位宽(可选,默认:512) + | + | query命令参数: + | --hdf5-file : HDF5文件路径(必需) + | --table-name : 表名(必需) + | --query-limit : 查询数据量(可选,默认:100) + | --topk : 返回最近邻的数量(可选,默认:100) + | --n-factor : LSH第一阶段过滤倍数(可选,默认:3) + |""".stripMargin) + } + + + def main(args: Array[String]): Unit = { + val builder = SparkSession.builder() + .appName("LocalSensitiveHashANN") + .master("local[1]") + .config("spark.sql.parquet.mergeSchema", value = true) + .config("spark.sql.parquet.filterPushdown", value = true) + .config("spark.sql.extensions", "com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension") + 
.config("spark.sql.catalog.lakesoul", classOf[LakeSoulCatalog].getName) + .config(SQLConf.DEFAULT_CATALOG.key, LakeSoulCatalog.CATALOG_NAME) + + val spark = builder.getOrCreate() + spark.sparkContext.setLogLevel("WARN") + + val command = parseArgs(args) + command match { + case Some(Load(config)) => + val hdfFile = new HdfFile(Paths.get(config.hdf5File)) + + val trainDataset = hdfFile.getDatasetByPath("train") + val testDataset = hdfFile.getDatasetByPath("test") + val trainData = trainDataset.getData() + val testData = testDataset.getData() + + val schema = StructType(Array( + StructField("IndexId", IntegerType, true), + StructField("Embedding", ArrayType(FloatType), true, new MetadataBuilder() + .putString(LakeSoulOptions.SchemaFieldMetadata.LSH_EMBEDDING_DIMENSION, config.embeddingDim.toString) + .putString(LakeSoulOptions.SchemaFieldMetadata.LSH_BIT_WIDTH, config.bitWidth.toString) + .putString(LakeSoulOptions.SchemaFieldMetadata.LSH_RNG_SEED, "1234567890").build()), + StructField("Embedding_LSH", ArrayType(LongType), true), + StructField("Split", StringType, true) + )) + trainData match { + case float2DData: Array[Array[Float]] => + val classIds = (1 to float2DData.length).toArray + + val rows = float2DData.zip(classIds).map { + case (embedding, indexId) => + // Row(indexId, embedding) + Row(indexId, embedding, null, "train") + } + val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema) + df.write.format("lakesoul") + .option("hashPartitions", "IndexId") + .option("hashBucketNum", 4) + .option(LakeSoulOptions.SHORT_TABLE_NAME, config.tableName) + .mode("Overwrite").save(s"${config.warehouse}/${config.tableName}") + case _ => + println("Invalid train data") + } + + + testData match { + case float2DTestData: Array[Array[Float]] => + val classIdsTest = (1 to float2DTestData.length).toArray + + val rowsTest = float2DTestData.zip(classIdsTest).map { + case (embedding, indexId) => + // Row(indexId, embedding) + Row(indexId, embedding, null, "test") + } + + // val num = 50 + val dfTest = spark.createDataFrame(spark.sparkContext.parallelize(rowsTest), schema) + LakeSoulTable.forPath(s"${config.warehouse}/${config.tableName}").upsert(dfTest) + + case _ => + println("Invalid test data") + } + + + case Some(Query(config)) => + val hdfFile = new HdfFile(Paths.get(config.hdf5File)) + val neighborDataset = hdfFile.getDatasetByPath("neighbors") + val neighborData = neighborDataset.getData() + var float2DDataNeighbor: Array[Array[Int]] = null + neighborData match { + case data: Array[Array[Int]] => + float2DDataNeighbor = data + case _ => + println("invalid neighbor data") + } + // the smaller the Hamming distance,the greater the similarity + val calculateHammingDistanceUDF = udf((trainLSH: Seq[Long], testLSH: Seq[Long]) => { + require(trainLSH.length == testLSH.length, "The input sequences must have the same length") + trainLSH.zip(testLSH).map { case (train, test) => + java.lang.Long.bitCount(train ^ test) + }.sum + }) + // the smaller the Euclidean distance,the greater the similarity + val calculateEuclideanDistanceUDF = udf((trainEmbedding: Seq[Float], testEmbedding: Seq[Float]) => { + require(testEmbedding.length == trainEmbedding.length, "The input sequences must have the same length") + sqrt(trainEmbedding.zip(testEmbedding).map { case (train, test) => + pow(train - test, 2) + }.sum) + }) + //the greater the Cosine distance,the greater the similarity + val calculateCosineDistanceUDF = udf((trainEmbedding: Seq[Float], testEmbedding: Seq[Float]) => { + 
require(testEmbedding.length == trainEmbedding.length, "The input sequences must have the same length") + trainEmbedding.zip(testEmbedding).map { case (train, test) => + train * test + }.sum / (sqrt(trainEmbedding.map { train => train * train }.sum) * sqrt(testEmbedding.map { test => test * test }.sum)) + }) + //the smaller the Jaccard distance,the greater the similarity + val calculateJaccardDistanceUDF = udf((trainEmbedding: Seq[Float], testEmbedding: Seq[Float]) => { + require(testEmbedding.length == trainEmbedding.length, "The input sequences must have the same length") + val anb = testEmbedding.intersect(trainEmbedding).distinct + val aub = testEmbedding.union(trainEmbedding).distinct + val jaccardCoefficient = anb.length.toDouble / aub.length + 1 - jaccardCoefficient + }) + spark.udf.register("calculateHammingDistance", calculateHammingDistanceUDF) + spark.udf.register("calculateEuclideanDistance", calculateEuclideanDistanceUDF) + spark.udf.register("calculateCosineDistance", calculateCosineDistanceUDF) + spark.udf.register("calculateJaccardDistance", calculateJaccardDistanceUDF) + + val startTime = System.nanoTime() + val result = spark.sql( + s""" + SELECT * + FROM ( + SELECT + testData.IndexId AS indexIdTest, + trainData.IndexId AS indexIdTrain, + testData.Embedding as EmbeddingTest, + trainData.Embedding as EmbeddingTrain, + ROW_NUMBER() OVER (PARTITION BY testData.IndexId ORDER BY calculateHammingDistance(testData.Embedding_LSH, trainData.Embedding_LSH) asc) AS rank + FROM (SELECT * FROM ${config.tableName} WHERE Split = 'test' LIMIT ${config.queryLimit}) testData + CROSS JOIN (SELECT * FROM ${config.tableName} WHERE Split = 'train') trainData + ) ranked + WHERE rank <= ${config.nFactor * config.topk} + """) + result.createOrReplaceTempView("rank") + val reResult = spark.sql( + s""" + SELECT * + FROM ( + SELECT + rank.indexIdTest, + rank.indexIDTrain, + ROW_NUMBER() OVER(PARTITION BY rank.indexIdTest ORDER BY calculateEuclideanDistance(rank.EmbeddingTest,rank.EmbeddingTrain) asc) AS reRank + FROM rank + ) reRanked + WHERE reRank <= ${config.topk} + """).groupBy("indexIdTest").agg(collect_list("indexIdTrain").alias("indexIdTrainList")) + + + val endTime = System.nanoTime() + val duration = (endTime - startTime).nanos + println(s"time for query n4topk ${config.nFactor} :${duration.toMillis} milliseconds") + + val startTime2 = System.nanoTime() + + val recallDF = reResult.select("indexIdTest", "indexIdTrainList").rdd.map(row => { + val indexIdTest = row.getAs[Int]("indexIdTest") + val indexIdTrainList = row.getAs[Seq[Int]]("indexIdTrainList").toArray + val updatedList = indexIdTrainList.map(_ - 1) + val count = float2DDataNeighbor(indexIdTest - 1).take(config.topk).count(updatedList.contains) + val recall = count.toDouble / config.topk + (recall, 1) + }).reduce((acc1, acc2) => { + (acc1._1 + acc2._1, acc1._2 + acc2._2) + }) + + val avgRecall = recallDF._1 / recallDF._2 + println(s"recall rate = $avgRecall") + + val endTime2 = System.nanoTime() + val duration2 = (endTime2 - startTime2).nanos + println(s"time for sort:${duration2.toMillis} milliseconds") + + case None => println("命令解析失败") + } + } +} diff --git a/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala b/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala index ccfdad905..9f71de83e 100644 --- a/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala +++ b/lakesoul-spark/src/main/scala/org/apache/spark/sql/lakesoul/LakeSoulOptions.scala @@ 
-132,4 +132,10 @@ object LakeSoulOptions { val SNAPSHOT_READ = "snapshot" val INCREMENTAL_READ = "incremental" } + + object SchemaFieldMetadata { + val LSH_EMBEDDING_DIMENSION = "lsh_embedding_dimension" + val LSH_BIT_WIDTH = "lsh_bit_width" + val LSH_RNG_SEED = "lsh_rng_seed" + } } diff --git a/pom.xml b/pom.xml index c8887eb60..23d4e3f99 100644 --- a/pom.xml +++ b/pom.xml @@ -51,7 +51,7 @@ SPDX-License-Identifier: Apache-2.0 - 2.6.0-SNAPSHOT + 2.6.2-SNAPSHOT UTF-8 UTF-8 UTF-8