diff --git a/.gitmodules b/.gitmodules index 5f7212dce..a24788cb8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,12 @@ -[submodule "submodules/Catch2"] - path = submodules/Catch2 +[submodule "components/core/submodules/Catch2"] + path = components/core/submodules/Catch2 url = https://github.com/catchorg/Catch2.git -[submodule "submodules/date"] - path = submodules/date +[submodule "components/core/submodules/date"] + path = components/core/submodules/date url = https://github.com/HowardHinnant/date.git -[submodule "submodules/yaml-cpp"] - path = submodules/yaml-cpp +[submodule "components/core/submodules/yaml-cpp"] + path = components/core/submodules/yaml-cpp url = https://github.com/jbeder/yaml-cpp.git -[submodule "submodules/json"] - path = submodules/json +[submodule "components/core/submodules/json"] + path = components/core/submodules/json url = https://github.com/nlohmann/json.git diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index f913393d2..9a4719c0e 100644 --- a/README.md +++ b/README.md @@ -1,183 +1,29 @@ # CLP -Compressed Log Processor (CLP) is a tool capable of losslessly compressing text logs and searching the compressed logs without decompression. -To learn more about it, you can read our [paper](https://www.usenix.org/system/files/osdi21-rodrigues.pdf). - -🔔 clp-core is part of a larger CLP package that can be built from [clp-packager](https://github.com/y-scope/clp-packager). - -## Contents - -* [Getting Started](#getting-started) -* [Requirements](#requirements) -* [Building](#building) - * [Source Dependencies](#source-dependencies) - * [Packages](#packages) - * [Libraries](#libraries) - * [Build](#build) -* [Running](#running) - * [`clp`](#clp) - * [`clg`](#clg) -* [Parallel Compression](#parallel-compression) -* [Next Steps](#next-steps) - +Compressed Log Processor (CLP) is a tool capable of losslessly compressing text logs and searching +the compressed logs without decompression. To learn more about it, you can read our +[paper](https://www.usenix.org/system/files/osdi21-rodrigues.pdf). ## Getting Started -CLP is currently released as source, so you'll need to build it before running it. - -## Requirements - -* We have built and tested CLP on **Ubuntu 18.04 (bionic)** and **Ubuntu 20.04 (focal)**. - * If you have trouble building for another OS, file an issue and we may be able to help. -* A compiler that supports c++14 - -## Building - -* To build, we require some source dependencies, packages from package managers, and libraries built from source. - -### Source Dependencies - -We use both git submodules and third-party source packages. 
To download all, you can run this script: -```shell -tools/scripts/deps-download/download-all.sh -``` - -This will download: -* [Catch2](https://github.com/catchorg/Catch2.git) (v2.13.6) -* [date](https://github.com/HowardHinnant/date.git) (v3.0.1) -* [json](https://github.com/nlohmann/json.git) (v3.10.2) -* [SQLite3](https://www.sqlite.org/download.html) (v3.36.0) -* [yaml-cpp](https://github.com/jbeder/yaml-cpp.git) (v0.7.0) - -### Packages - -If you're using apt-get, you can use the following command to install all: -```shell -sudo apt-get install -y ca-certificates checkinstall cmake build-essential \ -libboost-filesystem-dev libboost-iostreams-dev libboost-program-options-dev \ -libssl-dev pkg-config rsync wget zlib1g-dev -``` - -This will download: -* ca-certificates -* checkinstall -* cmake -* build-essential -* libboost-filesystem-dev -* libboost-iostreams-dev -* libboost-program-options-dev -* libssl-dev -* pkg-config -* rsync -* wget -* zlib1g-dev - -### Libraries - -The latest versions of some packages are not offered by apt repositories, -so we've included some scripts to download, compile, and install them: -```shell -./tools/scripts/lib_install/fmtlib.sh 8.0.1 -./tools/scripts/lib_install/libarchive.sh 3.5.1 -./tools/scripts/lib_install/lz4.sh 1.8.2 -./tools/scripts/lib_install/mariadb-connector-c.sh 3.2.3 -./tools/scripts/lib_install/spdlog.sh 1.9.2 -./tools/scripts/lib_install/zstandard.sh 1.4.9 -``` - -### Build - -* Configure the cmake project: - ```shell - mkdir build - cd build - cmake ../ - ``` - -* Build: - ```shell - make - ``` - -## Running - -* CLP contains two executables: `clp` and `clg` - * `clp` is used for compressing and extracting logs - * `clg` is used for performing wildcard searches on the compressed logs - -### `clp` - -To compress some logs: -```shell -./clp c archives-dir /home/my/logs -``` -* `archives-dir` is where compressed logs should be output - * `clp` will create a number of files and directories within, so it's best if this directory is empty - * You can use the same directory repeatedly and `clp` will add to the compressed logs within. -* `/home/my/logs` is any log file or directory containing log files - -To decompress those logs: -```shell -./clp x archive-dir decompressed -``` -* `archives-dir` is where the compressed logs were previously stored -* `decompressed` is a directory where they will be decompressed to - -You can also decompress a specific file: -```shell -./clp x archive-dir decompressed /my/file/path.log -``` -* `/my/file/path.log` is the uncompressed file's path (the one that was passed to `clp` for compression) - -More usage instructions can be found by running: -```shell -./clp --help -``` - -### `clg` - -To search the compressed logs: -```shell -./clg archives-dir " a *wildcard* search phrase " -``` -* `archives-dir` is where the compressed logs were previously stored -* The search phrase can contain the `*` wildcard which matches 0 or more characters, or the `?` wildcard which matches any single character. - -Similar to `clp`, `clg` can search a single file: -```shell -./clg archives-dir " a *wildcard* search phrase " /my/file/path.log -``` -* `/my/file/path.log` is the uncompressed file's path (the one that was passed to `clp` for compression) - -More usage instructions can be found by running: -```shell -./clg --help -``` - -## Parallel Compression - -By default, `clp` uses an embedded SQLite database, so each directory containing archives can only -be accessed by a single `clp` instance. 
+You can download a release from the [releases](TODO) page or you can build the latest by using the +[packager](tools/packager/README.md). -To enable parallel compression to the same archives directory, `clp`/`clg` can be configured to -use a MySQL-type database (MariaDB) as follows: +## Project Structure -* Install and configure MariaDB using the instructions for your platform -* Create a user that has privileges to create databases, create tables, insert records, and delete - records. -* Copy and change `config/metadata-db.yml`, setting the type to `mysql` and uncommenting the MySQL - parameters. -* Install the MariaDB and PyYAML Python packages `pip3 install mariadb PyYAML` - * This is necessary to run the database initialization script. If you prefer, you can run the - SQL statements in `tools/scripts/db/init-db.py` directly. -* Run `tools/scripts/db/init-db.py` with the updated config file. This will initialize the - database CLP requires. -* Run `clp` or `clg` as before, with the addition of the `--db-config-file` option pointing at - the updated config file. -* To compress in parallel, simply run another instance of `clp` concurrently. +CLP is currently split across a few different components in the [components](components) +directory: -Note that currently, decompression (`clp x`) and search (`clg`) can only be run with a single -instance. We are in the process of open-sourcing parallelizable versions of these as well. +* [clp-py-utils](components/clp-py-utils) contains Python utilities common to several of the + other components. +* [compression-job-handler](components/compression-job-handler) contains code to submit + compression jobs to a cluster. +* [core](components/core) contains code to compress uncompressed logs, decompress compressed + logs, and search compressed logs. +* [job-orchestration](components/job-orchestration) contains code to schedule compression jobs on + the cluster. +* [package-template](components/package-template) contains the base directory structure and files of the + CLP package. ## Next Steps diff --git a/components/clp-py-utils/LICENSE b/components/clp-py-utils/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/components/clp-py-utils/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/components/clp-py-utils/README.md b/components/clp-py-utils/README.md new file mode 100644 index 000000000..57db586bc --- /dev/null +++ b/components/clp-py-utils/README.md @@ -0,0 +1,10 @@ +# CLP Python Utilities + +This python module contains utilities imported by other Python modules in the CLP package. 
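+
+For example, once installed (see Installation below), another CLP component might use these utilities roughly as in this minimal sketch (the config file path is illustrative):
+
+```python
+import pathlib
+
+from clp_py_utils.clp_config import CLPConfig
+from clp_py_utils.core import read_yaml_config_file
+from clp_py_utils.pretty_size import pretty_size
+
+# Parse and validate a CLP package config file (the path below is hypothetical)
+clp_config = CLPConfig.parse_obj(read_yaml_config_file(pathlib.Path('etc/clp-config.yaml')))
+
+# Print one of the archive output size targets in human-readable form
+print(pretty_size(clp_config.archive_output.target_archive_size))
+```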
+ +## Installation + +```bash +pip3 install -r requirements.txt --target /lib/python3/site-packages +cp -R clp_py_utils /lib/python3/site-packages +``` diff --git a/components/clp-py-utils/clp_py_utils/__init__.py b/components/clp-py-utils/clp_py_utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py new file mode 100644 index 000000000..c00d0d8cc --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -0,0 +1,176 @@ +import typing + +from pydantic import BaseModel, validator + +from clp_py_utils.pretty_size import pretty_size + + +class Database(BaseModel): + type: str + host: str + port: int + username: str + password: str + name: str + ssl_cert: typing.Optional[str] = None + auto_commit: bool = False + compress: bool = True + + @validator('type') + def validate_database_type(cls, field): + supported_database_type = ['mysql', 'mariadb', 'bundled'] + if field not in supported_database_type: + raise ValueError(f'must be one of the following {"|".join(supported_database_type)}') + return field + + def get_mysql_connection_params(self): + # Currently, mysql's connector parameters are the same as mariadb's + connection_params = { + 'host': self.host, + 'port': self.port, + 'user': self.username, + 'password': self.password, + 'database': self.name, + 'compress': self.compress, + 'autocommit': self.auto_commit + } + if self.ssl_cert: + connection_params['ssl_cert'] = self.ssl_cert + return connection_params + + def get_mariadb_connection_params(self): + # Currently, mariadb's connector parameters are the same as mysql's + connection_params = { + 'host': self.host, + 'port': self.port, + 'user': self.username, + 'password': self.password, + 'database': self.name, + 'compress': self.compress, + 'autocommit': self.auto_commit + } + if self.ssl_cert: + connection_params['ssl_cert'] = self.ssl_cert + return connection_params + + def get_clp_connection_params_and_type(self): + connection_params_and_type = { + 'type': 'mysql', # hard-coded to 'mysql' since CLP only supports 'mysql' for the global database + 'host': self.host, + 'port': self.port, + 'username': self.username, + 'password': self.password, + 'name': self.name, + 'compress': self.compress, + 'autocommit': self.auto_commit + } + if self.ssl_cert: + connection_params_and_type['ssl_cert'] = self.ssl_cert + return connection_params_and_type + + +class Scheduler(BaseModel): + host: str + username: str + password: str + jobs_poll_delay: int + + +class ArchiveOutput(BaseModel): + type: str # Support only 'fs' type for now + directory: str + storage_is_node_specific: bool = False + target_archive_size: int + target_dictionaries_size: int + target_encoded_file_size: int + target_segment_size: int + + @validator('type') + def validate_type(cls, field): + if 'fs' != field: + raise ValueError('only fs type is supported in the opensource distribution') + return field + + @validator('target_archive_size') + def validate_target_archive_size(cls, field): + if field <= 0: + raise ValueError('target_archive_size parameter must be greater than 0') + return field + + @validator('target_dictionaries_size') + def validate_target_dictionaries_size(cls, field): + if field <= 0: + raise ValueError('target_dictionaries_size parameter must be greater than 0') + return field + + @validator('target_encoded_file_size') + def validate_target_encoded_file_size(cls, field): + if field <= 0: + raise ValueError(f'target_encoded_file_size
parameter must be greater than 0') + return field + + @validator('target_segment_size') + def validate_target_segment_size(cls, field): + if field <= 0: + raise ValueError('target_segment_size parameter must be greater than 0') + return field + + +class CLPConfig(BaseModel): + input_logs_dfs_path: str + database: Database + scheduler: Scheduler + archive_output: ArchiveOutput + data_directory: str + logs_directory: str + + def generate_config_file_content_with_comments(self): + file_content = [ + f'# A path containing any logs you wish to compress. Must be reachable by all workers.', + f'# - This path will be exposed inside the docker container.', + f'# - This path should not be any path that exists in the container image (an Ubuntu image) (e.g., /var/log).', + f'# - Limitation: Docker does not follow symlinks outside the build context, so we recommend avoiding symbolic links', + f'input_logs_dfs_path: {self.input_logs_dfs_path}', + f'', + f'database:', + f' type: {self.database.type}', + f' host: {self.database.host}', + f' port: {self.database.port}', + f' username: {self.database.username}', + f' password: {self.database.password}', + f' name: {self.database.name}', + f'', + f'scheduler:', + f' host: {self.scheduler.host}', + f' username: {self.scheduler.username}', + f' password: {self.scheduler.password}', + f' jobs_poll_delay: {self.scheduler.jobs_poll_delay} # Seconds', + f'', + f'# Where archives should be output to', + f'# Note: Only one output type may be specified', + f'archive_output:', + f' type: {self.archive_output.type}', + f' directory: "{self.archive_output.directory}"', + f'', + f' storage_is_node_specific: {self.archive_output.storage_is_node_specific}', + f'', + f' # How much data CLP should try to compress into each archive', + f' target_archive_size: {self.archive_output.target_archive_size} # {pretty_size(self.archive_output.target_archive_size)}', + f'', + f' # How large the dictionaries should be allowed to get before the archive is closed and a new one is created', + f' target_dictionaries_size: {self.archive_output.target_dictionaries_size} # {pretty_size(self.archive_output.target_dictionaries_size)}', + f'', + f' # How large each encoded file should be before being split into a new encoded file', + f' target_encoded_file_size: {self.archive_output.target_encoded_file_size} # {pretty_size(self.archive_output.target_encoded_file_size)}', + f'', + f' # How much data CLP should try to fit into each segment within an archive', + f' target_segment_size: {self.archive_output.target_segment_size} # {pretty_size(self.archive_output.target_segment_size)}', + f'', + f'# Location where other data is stored', + f'data_directory: "{self.data_directory}"', + f'', + f'# Location where logs are stored', + f'logs_directory: "{self.logs_directory}"', + f'', + ] + return '\n'.join(file_content) diff --git a/components/clp-py-utils/clp_py_utils/clp_io_config.py b/components/clp-py-utils/clp_py_utils/clp_io_config.py new file mode 100644 index 000000000..0be4c9164 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/clp_io_config.py @@ -0,0 +1,30 @@ +import typing + +from pydantic import BaseModel + + +class PathsToCompress(BaseModel): + file_paths: typing.List[str] + group_ids: typing.List[int] + st_sizes: typing.List[int] + empty_directories: typing.List[str] = None + + +class InputConfig(BaseModel): + type: str + list_path: str + path_prefix_to_remove: str = None + + +class OutputConfig(BaseModel): + type: str + target_archive_size: int + target_dictionaries_size: int +
target_segment_size: int + target_encoded_file_size: int + storage_is_node_specific: bool + + +class ClpIoConfig(BaseModel): + input: InputConfig + output: OutputConfig diff --git a/components/clp-py-utils/clp_py_utils/clp_package_config.py b/components/clp-py-utils/clp_py_utils/clp_package_config.py new file mode 100644 index 000000000..4c4d71ae0 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/clp_package_config.py @@ -0,0 +1,60 @@ +from pydantic import BaseModel, validator + +from clp_py_utils.pretty_size import pretty_size + + +# Limited set of configurations operation found in clp_config.py +class ArchiveOutput(BaseModel): + target_archive_size: int + target_dictionaries_size: int + target_encoded_file_size: int + target_segment_size: int + + @validator('target_archive_size') + def validate_target_archive_size(cls, field): + if field <= 0: + raise ValueError('target_archive_size parameter must be greater than 0') + return field + + @validator('target_dictionaries_size') + def validate_target_dictionaries_size(cls, field): + if field <= 0: + raise ValueError('target_dictionaries_size parameter must be greater than 0') + return field + + @validator('target_encoded_file_size') + def validate_target_encoded_file_size(cls, field): + if field <= 0: + raise ValueError('target_encoded_file_size parameter must be greater than 0') + return field + + @validator('target_segment_size') + def validate_target_segment_size(cls, field): + if field <= 0: + raise ValueError('target_segment_size parameter must be greater than 0') + return field + + +class CLPPackageConfig(BaseModel): + cluster_name: str + archive_output: ArchiveOutput + + def generate_package_config_file_content_with_comments(self): + file_content = [ + f'cluster_name: {self.cluster_name}', + f'', + f'archive_output:', + f' # How much data CLP should try to compress into each archive', + f' target_archive_size: {str(self.archive_output.target_archive_size)} # {pretty_size(self.archive_output.target_archive_size)}', + f'', + f' # How large the dictionaries should be allowed to get before the archive is closed and a new one is created', + f' target_dictionaries_size: {str(self.archive_output.target_dictionaries_size)} # {pretty_size(self.archive_output.target_dictionaries_size)}', + f'', + f' # How large each encoded file should be before being split into a new encoded file', + f' target_encoded_file_size: {str(self.archive_output.target_encoded_file_size)} # {pretty_size(self.archive_output.target_encoded_file_size)}', + f'', + f' # How much data CLP should try to fit into each segment within an archive', + f' target_segment_size: {str(self.archive_output.target_segment_size)} # {pretty_size(self.archive_output.target_segment_size)}', + f'' + ] + return '\n'.join(file_content) diff --git a/components/clp-py-utils/clp_py_utils/compression.py b/components/clp-py-utils/clp_py_utils/compression.py new file mode 100644 index 000000000..dbad6d10f --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/compression.py @@ -0,0 +1,141 @@ +import pathlib +import typing + +import Levenshtein + +# Constants +FILE_GROUPING_MIN_LEVENSHTEIN_RATIO = 0.6 + + +class FileMetadata: + __slots__ = ('path', 'size', 'estimated_uncompressed_size') + + def __init__(self, path: pathlib.Path, size: int): + self.path = path + self.size = size + self.estimated_uncompressed_size = size + + filename = path.name + if any(filename.endswith(extension) for extension in ['.gz', '.gzip', '.tgz', '.tar.gz']): + self.estimated_uncompressed_size *= 13 + elif 
any(filename.endswith(extension) for extension in ['.zstd', '.zstandard', '.tar.zstd', '.tar.zstandard']): + self.estimated_uncompressed_size *= 8 + + +class FilesPartition: + def __init__(self): + self.__files = [] + self.__file_paths = [] + self.__group_ids = [] + self.__st_sizes = [] + self.__total_file_size = 0 + + def add_file(self, file_metadata: FileMetadata, group_id: int): + self.__files.append(file_metadata) + self.__file_paths.append(str(file_metadata.path)) + self.__group_ids.append(group_id) + self.__st_sizes.append(file_metadata.size) + self.__total_file_size += file_metadata.estimated_uncompressed_size + + def add_file_if_empty(self, file_metadata: FileMetadata, group_id: int): + if file_metadata.estimated_uncompressed_size > 0: + return False + + self.__files.append(file_metadata) + self.__file_paths.append(str(file_metadata.path)) + self.__group_ids.append(group_id) + self.__st_sizes.append(file_metadata.size) + return True + + def pop_files(self): + files = self.__files + file_paths = self.__file_paths + group_ids = self.__group_ids + st_sizes = self.__st_sizes + total_file_size = self.__total_file_size + + self.__files = [] + self.__file_paths = [] + self.__group_ids = [] + self.__st_sizes = [] + self.__total_file_size = 0 + + return files, file_paths, group_ids, st_sizes, total_file_size + + def get_total_file_size(self): + return self.__total_file_size + + def contains_files(self): + return len(self.__files) > 0 + + +def file_paths_in_same_group(a: pathlib.Path, b: pathlib.Path): + return Levenshtein.ratio(str(a.name), str(b.name)) >= FILE_GROUPING_MIN_LEVENSHTEIN_RATIO + + +def group_files_by_similar_filenames(files: typing.List[FileMetadata]): + groups = [] + + if len(files) == 0: + return groups + + current_group_id = 0 + current_group = {'id': current_group_id, 'files': []} + groups.append(current_group) + + # Sort by filename + files.sort(key=lambda x: x.path.name) + + file_ix = 0 + file = files[file_ix] + current_group['files'].append(file) + last_file_path = file.path + + for file_ix in range(1, len(files)): + file = files[file_ix] + if not file_paths_in_same_group(last_file_path, file.path): + current_group_id += 1 + current_group = {'id': current_group_id, 'files': []} + groups.append(current_group) + + current_group['files'].append(file) + last_file_path = file.path + + return groups + + +def validate_path_and_get_info(required_parent_dir: pathlib.Path, path: pathlib.Path): + file = None + empty_directory = None + + # Verify that path is absolute + if not path.is_absolute(): + raise ValueError(f'"{path}" is not absolute.') + + # Verify that path exists + if not path.exists(): + raise ValueError(f'"{path}" does not exist.') + + # Verify that path points to a file/dir within required parent dir + try: + path.resolve().relative_to(required_parent_dir) + except ValueError: + raise ValueError(f'"{path}" is not within {required_parent_dir}') + + # Convert path to a path within required parent dir if necessary + # (e.g., if path is a symlink outside parent dir, but points to a file/dir inside parent dir) + try: + path.relative_to(required_parent_dir) + except ValueError: + # Not within parent dir, so resolve it + path = path.resolve() + + if path.is_dir(): + # Check if directory is empty + if next(path.iterdir(), None) is None: + empty_directory = str(path) + else: + file_size = path.stat().st_size + file = FileMetadata(path, file_size) + + return file, empty_directory diff --git a/components/clp-py-utils/clp_py_utils/core.py 
b/components/clp-py-utils/clp_py_utils/core.py new file mode 100644 index 000000000..db202d432 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/core.py @@ -0,0 +1,11 @@ +import pathlib + +import yaml + + +def read_yaml_config_file(yaml_config_file_path: pathlib.Path): + with open(yaml_config_file_path, 'r') as yaml_config_file: + config = yaml.safe_load(yaml_config_file) + if config is None: + raise Exception(f'Unable to parse configuration from {yaml_config_file_path}.') + return config diff --git a/components/clp-py-utils/clp_py_utils/initialize-clp-metadata-db.py b/components/clp-py-utils/clp_py_utils/initialize-clp-metadata-db.py new file mode 100644 index 000000000..ce072182d --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/initialize-clp-metadata-db.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +import argparse +import logging +import sys +from contextlib import closing + +from pydantic import ValidationError +from sql_adapter import SQL_Adapter + +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.core import read_yaml_config_file + +# Setup logging +# Create logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +logger.addHandler(logging_console_handler) + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Setup CLP metadata tables compression and search.') + args_parser.add_argument('--config', required=True, help='CLP package config file.') + parsed_args = args_parser.parse_args(argv[1:]) + + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(parsed_args.config)) + sql_adapter = SQL_Adapter(clp_config.database) + with closing(sql_adapter.create_connection()) as metadata_db, \ + closing(metadata_db.cursor(dictionary=True)) as metadata_db_cursor: + metadata_db_cursor.execute(""" + CREATE TABLE IF NOT EXISTS `archives` ( + `pagination_id` BIGINT unsigned NOT NULL AUTO_INCREMENT, + `id` VARCHAR(64) NOT NULL, + `storage_id` VARCHAR(64) NOT NULL, + `uncompressed_size` BIGINT NOT NULL, + `size` BIGINT NOT NULL, + `creator_id` VARCHAR(64) NOT NULL, + `creation_ix` INT NOT NULL, + KEY `archives_creation_order` (`creator_id`,`creation_ix`) USING BTREE, + UNIQUE KEY `archive_id` (`id`) USING BTREE, + PRIMARY KEY (`pagination_id`) + ); + """ + ) + + metadata_db_cursor.execute(""" + CREATE TABLE IF NOT EXISTS `files` ( + `id` VARCHAR(64) NOT NULL, + `orig_file_id` VARCHAR(64) NOT NULL, + `path` VARCHAR(12288) NOT NULL, + `begin_timestamp` BIGINT NOT NULL, + `end_timestamp` BIGINT NOT NULL, + `num_uncompressed_bytes` BIGINT NOT NULL, + `num_messages` BIGINT NOT NULL, + `archive_id` VARCHAR(64) NOT NULL, + KEY `files_path` (path(768)) USING BTREE, + KEY `files_archive_id` (`archive_id`) USING BTREE, + PRIMARY KEY (`id`) + ) ROW_FORMAT=DYNAMIC + ; + """ + ) + + metadata_db.commit() + logger.info('Successfully created clp metadata tables for compression and search') + + except ValidationError as err: + logger.error(err) + return -1 + except Exception as ex: + logger.error(ex) + return -1 + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py new file mode 100644 index 000000000..68161c229 --- /dev/null +++ 
b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +import argparse +import logging +import sys +from contextlib import closing + +from pydantic import ValidationError +from sql_adapter import SQL_Adapter + +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.core import read_yaml_config_file + +# Setup logging +# Create logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +logger.addHandler(logging_console_handler) + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Setup metadata tables for job orchestration.') + args_parser.add_argument('--config', required=True, help='CLP package config file.') + parsed_args = args_parser.parse_args(argv[1:]) + + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(parsed_args.config)) + sql_adapter = SQL_Adapter(clp_config.database) + with closing(sql_adapter.create_connection()) as scheduling_db, \ + closing(scheduling_db.cursor(dictionary=True)) as scheduling_db_cursor: + scheduling_db_cursor.execute(""" + CREATE TABLE IF NOT EXISTS `compression_jobs` ( + `job_id` INT NOT NULL AUTO_INCREMENT, + `job_status` VARCHAR(16) NOT NULL DEFAULT 'SCHEDULING', + `job_status_msg` VARCHAR(255) NOT NULL DEFAULT '', + `job_creation_time` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `job_start_time` DATETIME NULL DEFAULT NULL, + `job_duration` INT NULL DEFAULT NULL, + `job_original_size` BIGINT NOT NULL DEFAULT '0', + `job_uncompressed_size` BIGINT NOT NULL DEFAULT '0', + `job_compressed_size` BIGINT NOT NULL DEFAULT '0', + `num_tasks` INT NOT NULL DEFAULT '0', + `num_tasks_completed` INT NOT NULL DEFAULT '0', + `clp_binary_version` INT NULL DEFAULT NULL, + `clp_config` VARBINARY(60000) NOT NULL, + PRIMARY KEY (`job_id`) USING BTREE, + INDEX `JOB_STATUS` (`job_status`) USING BTREE + ) ROW_FORMAT=DYNAMIC + ; + """ + ) + + scheduling_db_cursor.execute(""" + CREATE TABLE IF NOT EXISTS `compression_tasks` ( + `task_id` BIGINT NOT NULL AUTO_INCREMENT, + `task_status` VARCHAR(16) NOT NULL DEFAULT 'SUBMITTED', + `task_scheduled_time` DATETIME NULL DEFAULT NULL, + `task_start_time` DATETIME NULL DEFAULT NULL, + `task_duration` SMALLINT NULL DEFAULT NULL, + `job_id` INT NOT NULL, + `clp_paths_to_compress` VARBINARY(60000) NOT NULL, + `partition_original_size` BIGINT NOT NULL, + `partition_uncompressed_size` BIGINT NULL DEFAULT NULL, + `partition_compressed_size` BIGINT NULL DEFAULT NULL, + PRIMARY KEY (`task_id`) USING BTREE, + INDEX `job_id` (`job_id`) USING BTREE, + INDEX `TASK_STATUS` (`task_status`) USING BTREE, + INDEX `TASK_START_TIME` (`task_start_time`) USING BTREE, + CONSTRAINT `compression_tasks` FOREIGN KEY (`job_id`) + REFERENCES `compression_jobs` (`job_id`) ON UPDATE NO ACTION ON DELETE NO ACTION + ) ROW_FORMAT=DYNAMIC + ; + """ + ) + + scheduling_db.commit() + logger.info('Successfully created compression_jobs and compression_tasks orchestration tables') + + except ValidationError as err: + logger.error(err) + return -1 + except Exception as ex: + logger.error(ex) + return -1 + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/clp-py-utils/clp_py_utils/pretty_size.py b/components/clp-py-utils/clp_py_utils/pretty_size.py new file mode 100644 index 000000000..c69c5ee59 --- 
/dev/null +++ b/components/clp-py-utils/clp_py_utils/pretty_size.py @@ -0,0 +1,6 @@ +def pretty_size(num, suffix='B'): + for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(num) < 1024.0: + return '%3.2f%s%s' % (num, unit, suffix) + num /= 1024.0 + return '%.2f%s%s' % (num, 'Yi', suffix) diff --git a/components/clp-py-utils/clp_py_utils/sql_adapter.py b/components/clp-py-utils/clp_py_utils/sql_adapter.py new file mode 100644 index 000000000..a42fa79c9 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/sql_adapter.py @@ -0,0 +1,43 @@ +import logging + +import mariadb +import mysql.connector +from mysql.connector import errorcode + +from clp_py_utils.clp_config import Database + + +class SQL_Adapter: + def __init__(self, database_config: Database): + self.database_config = database_config + + def create_mysql_connection(self) -> mysql.connector.MySQLConnection: + try: + connection = mysql.connector.connect(**self.database_config.get_mysql_connection_params()) + except mysql.connector.Error as err: + if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: + logging.error('Database access denied.') + elif err.errno == errorcode.ER_BAD_DB_ERROR: + logging.error(f'Specified database "{self.database_config.name}" does not exist.') + else: + logging.error(err) + raise err + else: + return connection + + def create_mariadb_connection(self) -> mariadb.connection: + try: + connection = mariadb.connect(**self.database_config.get_mysql_connection_params()) + except mariadb.Error as err: + logging.error(f'Error connecting to MariaDB: {err}') + raise err + else: + return connection + + def create_connection(self): + if 'mysql' == self.database_config.type: + return self.create_mysql_connection() + elif 'mariadb' == self.database_config.type: + return self.create_mariadb_connection() + else: + raise NotImplementedError diff --git a/components/clp-py-utils/requirements.txt b/components/clp-py-utils/requirements.txt new file mode 100644 index 000000000..c5a0c915f --- /dev/null +++ b/components/clp-py-utils/requirements.txt @@ -0,0 +1,5 @@ +python-Levenshtein +PyYAML==5.4 +pydantic==1.8.2 +mysql-connector-python==8.0.26 +mariadb~=1.0.7 diff --git a/components/compression-job-handler/.gitignore b/components/compression-job-handler/.gitignore new file mode 100644 index 000000000..4b0a0fbd9 --- /dev/null +++ b/components/compression-job-handler/.gitignore @@ -0,0 +1 @@ +clp-config.yaml \ No newline at end of file diff --git a/components/compression-job-handler/LICENSE b/components/compression-job-handler/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/components/compression-job-handler/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
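For orientation before the compression-job-handler files below, here is a minimal usage sketch for the `clp_py_utils` helpers added earlier in this diff (`pretty_size` and `SQL_Adapter`); it assumes the `clp_py_utils` package is importable, and the sample values are purely illustrative.

```python
# Minimal usage sketch for the clp_py_utils helpers added above; values are illustrative.
from clp_py_utils.pretty_size import pretty_size

print(pretty_size(3 * 1024 ** 3))  # "3.00GB"
print(pretty_size(1_234_567))      # "1.18MB"

# SQL_Adapter dispatches on the configured database type:
#   SQL_Adapter(database_config).create_connection()
# returns a mysql.connector connection when database_config.type is 'mysql',
# a mariadb connection when it is 'mariadb', and raises NotImplementedError otherwise.
```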
\ No newline at end of file diff --git a/components/compression-job-handler/README.md b/components/compression-job-handler/README.md new file mode 100644 index 000000000..423299f51 --- /dev/null +++ b/components/compression-job-handler/README.md @@ -0,0 +1,32 @@ +# CLP Compression Job Handler + +This Python module submits compression jobs to the CLP compression scheduler. + +## Installation + +```bash +pip3 install -r requirements.txt --target /lib/python3/site-packages +cp -R clp_py_utils /lib/python3/site-packages +``` + +## Usage + +Below are a few ways to use this module. + +### Docker compression wrapper + +```bash +/sbin/compress +``` + +### Native compression wrapper + +```bash +/sbin/native/compress +``` + +### Standalone + +```bash +PYTHONPATH= python3 -m compression_job_handler +``` diff --git a/components/compression-job-handler/compression_job_handler/__init__.py b/components/compression-job-handler/compression_job_handler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/compression-job-handler/compression_job_handler/compression_job_handler.py b/components/compression-job-handler/compression_job_handler/compression_job_handler.py new file mode 100644 index 000000000..e88983615 --- /dev/null +++ b/components/compression-job-handler/compression_job_handler/compression_job_handler.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +import argparse +import logging +import pathlib +import sys +import time +import typing +from contextlib import closing + +import msgpack +import mysql.connector +import zstandard +import zstandard as zstd +from pydantic import ValidationError + +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_io_config import PathsToCompress, InputConfig, OutputConfig, ClpIoConfig +from clp_py_utils.compression import FileMetadata, FilesPartition, \ + group_files_by_similar_filenames, validate_path_and_get_info +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.pretty_size import pretty_size +from clp_py_utils.sql_adapter import SQL_Adapter +from .utils.common import JobCompletionStatus + +# Setup logging +# Create logger +logger = logging.getLogger('compression-job-handler') +logger.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +logger.addHandler(logging_console_handler) + + +class PathsToCompressBuffer: + def __init__(self, scheduler_db_cursor, maintain_file_ordering: bool, + empty_directories_allowed: bool, target_archive_size: int, + file_size_to_trigger_compression, scheduling_job_id: int, zstd_cctx): + self.__files: typing.List[FileMetadata] = [] + self.__maintain_file_ordering: bool = maintain_file_ordering + if empty_directories_allowed: + self.__empty_directories: typing.Optional[typing.List[str]] = [] + else: + self.__empty_directories: typing.Optional[typing.List[str]] = None + self.__total_file_size: int = 0 + self.__target_archive_size: int = target_archive_size + self.__file_size_to_trigger_compression: int = file_size_to_trigger_compression + self.__scheduling_job_id: int = scheduling_job_id + self.scheduling_job_id: int = scheduling_job_id + self.__zstd_cctx = zstd_cctx + + self.__scheduler_db_cursor = scheduler_db_cursor + self.num_tasks = 0 + + def add_file(self, file: FileMetadata): + self.__files.append(file) + self.__total_file_size += file.estimated_uncompressed_size + + if 
self.__total_file_size >= self.__file_size_to_trigger_compression: + self.__partition_and_compress(False) + + def add_empty_directory(self, path: pathlib.Path): + if self.__empty_directories is None: + return + self.__empty_directories.append(str(path)) + + def flush(self): + self.__partition_and_compress(True) + + def contains_paths(self): + return len(self.__files) > 0 or ( + self.__empty_directories and len(self.__empty_directories) > 0) + + def __submit_partition_for_compression(self, partition: FilesPartition): + files, file_paths, group_ids, st_sizes, partition_total_file_size = partition.pop_files() + paths_to_compress = PathsToCompress(file_paths=file_paths, group_ids=group_ids, st_sizes=st_sizes) + + if self.__empty_directories is not None and len(self.__empty_directories) > 0: + paths_to_compress.empty_directories = self.__empty_directories + self.__empty_directories = [] + + # Note: partition_total_file_size => estimated size, aggregate + # the st_size => real original size + self.__scheduler_db_cursor.execute( + f'INSERT INTO compression_tasks ' + f'(job_id, partition_original_size, clp_paths_to_compress) ' + f'VALUES({str(self.__scheduling_job_id)}, {str(sum(st_sizes))}, %s);', + (self.__zstd_cctx.compress(msgpack.packb(paths_to_compress.dict(exclude_none=True))),) + ) + self.num_tasks += 1 + + return partition_total_file_size + + def add_files(self, target_num_archives: int, target_archive_size: int, files): + target_num_archives = min(len(files), target_num_archives) + + groups = group_files_by_similar_filenames(files) + next_file_ix_per_group = [0 for _ in range(len(groups))] + + partitions = [FilesPartition() for _ in range(target_num_archives)] + + # Distribute files across partitions in round-robin order; full partitions are skipped + next_partition_ix = 0 + group_ix = 0 + while len(groups) > 0: + group_file_ix = next_file_ix_per_group[group_ix] + group_id = groups[group_ix]['id'] + group_files = groups[group_ix]['files'] + + file = group_files[group_file_ix] + + # Look for a partition with space + while True: + partition = partitions[next_partition_ix] + next_partition_ix = (next_partition_ix + 1) % target_num_archives + if partition.get_total_file_size() < target_archive_size: + break + + partition.add_file(file, group_id) + + group_file_ix += 1 + if len(group_files) == group_file_ix: + groups.pop(group_ix) + next_file_ix_per_group.pop(group_ix) + else: + next_file_ix_per_group[group_ix] = group_file_ix + group_ix += 1 + if len(groups) > 0: + group_ix %= len(groups) + + # Compress partitions + for partition in partitions: + self.__submit_partition_for_compression(partition) + + def __partition_and_compress(self, flush_buffer: bool): + if not flush_buffer and self.__total_file_size < self.__target_archive_size: + # Not enough data for a full partition and we don't need to exhaust the buffer + return + if not self.contains_paths(): + # Nothing to compress + return + + partition = FilesPartition() + + if self.__maintain_file_ordering: + # NOTE: grouping by filename is not supported when maintaining file ordering, + # so we give each file its own group ID to maintain ordering + + group_ix = 0 + # Compress full partitions + if self.__total_file_size >= self.__target_archive_size: + file_ix = 0 + for file_ix, file in enumerate(self.__files): + partition.add_file(file, group_ix) + group_ix += 1 + + # Compress partition if ready + if partition.get_total_file_size() >= self.__target_archive_size: + self.__total_file_size -= self.__submit_partition_for_compression( + partition) 
+ if self.__total_file_size < self.__target_archive_size: + # Not enough files to fill a partition, so break + break + # Pop compressed files + self.__files = self.__files[file_ix + 1:] + + # Compress remaining partial partition if necessary + if flush_buffer and self.contains_paths(): + for file in self.__files: + partition.add_file(file, group_ix) + group_ix += 1 + self.__total_file_size -= self.__submit_partition_for_compression(partition) + self.__files = [] + else: + groups = group_files_by_similar_filenames(self.__files) + next_file_ix_per_group = [0 for _ in range(len(groups))] + + group_ix = 0 + while len(groups) > 0: + group_file_ix = next_file_ix_per_group[group_ix] + group_id = groups[group_ix]['id'] + group_files = groups[group_ix]['files'] + + file = group_files[group_file_ix] + + partition.add_file(file, group_id) + + group_file_ix += 1 + if len(group_files) == group_file_ix: + groups.pop(group_ix) + next_file_ix_per_group.pop(group_ix) + else: + next_file_ix_per_group[group_ix] = group_file_ix + group_ix += 1 + if len(groups) > 0: + group_ix %= len(groups) + + # Compress partition if ready + if partition.get_total_file_size() >= self.__target_archive_size: + self.__total_file_size -= self.__submit_partition_for_compression(partition) + if not flush_buffer and self.__total_file_size < self.__target_archive_size: + # Not enough files to fill a partition and + # we don't need to exhaust the buffer, so break + break + + # Compress partial partition + if partition.contains_files(): + self.__total_file_size -= self.__submit_partition_for_compression(partition) + self.__files = [] + + # Pop compressed files + remaining_files = [] + for group_ix, group in enumerate(groups): + group_files = group['files'] + group_file_ix = next_file_ix_per_group[group_ix] + for i in range(group_file_ix, len(group_files)): + remaining_files.append(group_files[i]) + self.__files = remaining_files + + # Compress any remaining empty directories + if flush_buffer and self.contains_paths(): + self.__total_file_size -= self.__submit_partition_for_compression(partition) + self.__files = [] + + +def handle_job(scheduling_db, scheduling_db_cursor, clp_io_config: ClpIoConfig, logs_dir_abs: str, + fs_logs_required_parent_dir: pathlib.Path, zstd_cctx: zstandard.ZstdCompressor, + no_progress_reporting: bool) -> JobCompletionStatus: + job_logger = None + all_worker_jobs_successful = True + + try: + job_completed_with_errors = False + if 'fs' == clp_io_config.input.type: + # Create new job in the sql database + scheduling_db_cursor.execute( + 'INSERT INTO compression_jobs (clp_config) VALUES (%s);', + (zstd_cctx.compress(msgpack.packb(clp_io_config.dict(exclude_none=True, exclude_unset=True))),) + ) + scheduling_db.commit() + scheduling_job_id = scheduling_db_cursor.lastrowid + + # Create job-specific logger + job_str = f'job-{scheduling_job_id}' + job_logger = logging.getLogger(job_str) + job_logger.setLevel(logging.INFO) + combined_log_file_path = f'{logs_dir_abs}/{job_str}.log' + job_logger_file_handler = logging.FileHandler(combined_log_file_path) + job_logger_file_handler.setFormatter(logging_formatter) + job_logger.addHandler(logging_console_handler) + job_logger.addHandler(job_logger_file_handler) + + job_logger.debug(f'Starting job {scheduling_job_id}') + + paths_to_compress_buffer = PathsToCompressBuffer( + scheduler_db_cursor=scheduling_db_cursor, + maintain_file_ordering=False, + empty_directories_allowed=True, + target_archive_size=clp_io_config.output.target_archive_size, + 
file_size_to_trigger_compression=clp_io_config.output.target_archive_size * 2, + scheduling_job_id=scheduling_job_id, + zstd_cctx=zstd_cctx + ) + + # Compress all files at once to try and satisfy the target number of archives + job_logger.info("Iterating and partitioning files into tasks.") + # TODO: Handle file not found + with open(pathlib.Path(clp_io_config.input.list_path).resolve(), 'r') as f: + for path_idx, path in enumerate(f, start=1): + stripped_path = path.strip() + if '' == stripped_path: + # Skip empty paths + continue + path = pathlib.Path(stripped_path) + + try: + file, empty_directory = validate_path_and_get_info(fs_logs_required_parent_dir, path) + except ValueError as ex: + job_logger.error(str(ex)) + job_completed_with_errors = True + continue + + if file: + paths_to_compress_buffer.add_file(file) + elif empty_directory: + paths_to_compress_buffer.add_empty_directory(empty_directory) + + if path.is_dir(): + for internal_path in path.rglob('*'): + try: + file, empty_directory = validate_path_and_get_info( + fs_logs_required_parent_dir, internal_path) + except ValueError as ex: + job_logger.error(str(ex)) + job_completed_with_errors = True + continue + + if file: + paths_to_compress_buffer.add_file(file) + elif empty_directory: + paths_to_compress_buffer.add_empty_directory(empty_directory) + + if path_idx % 10000 == 0: + scheduling_db.commit() + + paths_to_compress_buffer.flush() + + # Ensure all of the scheduled task and the total number of tasks + # in the job row has been updated and committed + scheduling_db_cursor.execute( + f'UPDATE compression_jobs ' + f'SET num_tasks={paths_to_compress_buffer.num_tasks}, job_status="SCHEDULED" ' + f'WHERE job_id={scheduling_job_id};' + ) + scheduling_db.commit() + + # TODO: what happens when commit fails, log error and crash ASAP + + # Wait for jobs to finish + job_logger.info(f'Waiting for {paths_to_compress_buffer.num_tasks} task(s) to finish.') + + # Simply poll the job_status in the job scheduling table + if no_progress_reporting: + polling_query = \ + f'SELECT job_status, job_status_msg FROM compression_jobs ' \ + f'WHERE job_id={scheduling_job_id};' + else: + polling_query = \ + f'SELECT job_status, job_status_msg, job_uncompressed_size, job_compressed_size ' \ + f'FROM compression_jobs WHERE job_id={scheduling_job_id};' + + completion_query = \ + f'SELECT job_duration, job_uncompressed_size, job_compressed_size ' \ + f'FROM compression_jobs WHERE job_id={scheduling_job_id};' + + job_last_uncompressed_size = 0 + while True: + scheduling_db_cursor.execute(polling_query) + + # Using fetchall() here t + results = scheduling_db_cursor.fetchall() + if len(results) > 1: + logging.error("Duplicated job_id") + logging.error(str(results)) + if len(results) == 0: + time.sleep(1) + continue + if isinstance(scheduling_db, mysql.connector.MySQLConnection): + scheduling_db.commit() # clear the query cache + + job_row = results[0] + job_status = job_row['job_status'] + + if not no_progress_reporting: + job_uncompressed_size = job_row['job_uncompressed_size'] + job_compressed_size = job_row['job_compressed_size'] + if job_uncompressed_size > 0: + compression_ratio = float(job_uncompressed_size) / job_compressed_size + if job_last_uncompressed_size < job_uncompressed_size: + job_logger.info( + f'Compressed {pretty_size(job_uncompressed_size)} into ' + f'{pretty_size(job_compressed_size)} ({compression_ratio:.2f})') + job_last_uncompressed_size = job_uncompressed_size + + if job_status == 'SCHEDULED': + pass # Simply wait another iteration + 
elif job_status == 'COMPLETED': + # All tasks in the job is done + if not no_progress_reporting: + scheduling_db_cursor.execute(completion_query) + job_row = scheduling_db_cursor.fetchone() + if job_row['job_duration']: + speed = job_row['job_uncompressed_size'] / job_row['job_duration'] + job_logger.info( + f'Compression finished. Runtime: {str(job_row["job_duration"])}s. ' + f'Speed: {pretty_size(speed)}/s.') + break # Done + elif job_status == 'FAILED': + # One or more tasks in the job has failed + job_logger.error(f'Compression failed. See log file in {job_row["job_status_msg"]}') + break # Done + else: + job_logger.info(f'handler for job_status "{job_status}" is not implemented') + raise NotImplementedError + + scheduling_db.commit() # clear the query cache + time.sleep(1) + + except Exception as ex: + if job_logger: + job_logger.exception(f'Exception while processing {job_str}.') + job_logger.error(ex) + all_worker_jobs_successful = False + finally: + if job_logger: + job_logger.removeHandler(job_logger_file_handler) + job_logger_file_handler.flush() + job_logger_file_handler.close() + + if not all_worker_jobs_successful: + return JobCompletionStatus.FAILED + elif job_completed_with_errors: + return JobCompletionStatus.SUCCEEDED_WITH_ERRORS + + logger.debug(f'Finished job {job_str}') + + return JobCompletionStatus.SUCCEEDED + + +def handle_jobs(sql_adapter: SQL_Adapter, clp_io_config: ClpIoConfig, logs_dir_abs: str, + fs_logs_required_parent_dir: pathlib.Path, no_progress_reporting: bool): + logger.info('compression-job-handler started.') + + # Instantiate zstdandard compression context + zstd_cctx = zstd.ZstdCompressor(level=3) + + # Connect to SQL Database + with closing(sql_adapter.create_connection()) as scheduling_db, \ + closing(scheduling_db.cursor(dictionary=True)) as scheduling_db_cursor: + # Execute new compression job + handle_job(scheduling_db=scheduling_db, scheduling_db_cursor=scheduling_db_cursor, clp_io_config=clp_io_config, + logs_dir_abs=logs_dir_abs, fs_logs_required_parent_dir=fs_logs_required_parent_dir, + zstd_cctx=zstd_cctx, no_progress_reporting=no_progress_reporting) + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Wait for and run compression jobs.') + args_parser.add_argument('--fs-logs-required-parent-dir', default="/nonexistent", + help='The required parent for any logs ingested from the filesystem.') + args_parser.add_argument('--no-progress-reporting', action='store_true', help='Disables progress reporting.') + args_parser.add_argument('--config', '-c', required=True, help='CLP configuration file.') + args_parser.add_argument('--log-list-path', required=True, help='File containing list of input files to compress') + parsed_args = args_parser.parse_args(argv[1:]) + + # Load configuration + config_path = pathlib.Path(parsed_args.config) + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(config_path)) + except ValidationError as err: + logger.error(err) + except Exception as ex: + # read_yaml_config_file already logs the parsing error inside + pass + else: + # Configure file system directory locations # TODO: refactor with better comment + fs_logs_required_parent_dir = pathlib.Path(parsed_args.fs_logs_required_parent_dir) + + sql_adapter = SQL_Adapter(clp_config.database) + + clp_io_config = ClpIoConfig( + input=InputConfig(type='fs', list_path=str(pathlib.Path(parsed_args.log_list_path).resolve())), + output=OutputConfig.parse_obj(clp_config.archive_output) + ) + + logs_directory_abs = 
str(pathlib.Path(clp_config.logs_directory).resolve()) + + handle_jobs(sql_adapter=sql_adapter, clp_io_config=clp_io_config, logs_dir_abs=logs_directory_abs, + fs_logs_required_parent_dir=fs_logs_required_parent_dir, + no_progress_reporting=parsed_args.no_progress_reporting) + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/compression-job-handler/compression_job_handler/utils/__init__.py b/components/compression-job-handler/compression_job_handler/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/compression-job-handler/compression_job_handler/utils/common.py b/components/compression-job-handler/compression_job_handler/utils/common.py new file mode 100644 index 000000000..ce119402e --- /dev/null +++ b/components/compression-job-handler/compression_job_handler/utils/common.py @@ -0,0 +1,7 @@ +import enum + + +class JobCompletionStatus(enum.IntEnum): + SUCCEEDED = 0 + FAILED = 1 + SUCCEEDED_WITH_ERRORS = 2 diff --git a/components/compression-job-handler/requirements.txt b/components/compression-job-handler/requirements.txt new file mode 100644 index 000000000..444771b39 --- /dev/null +++ b/components/compression-job-handler/requirements.txt @@ -0,0 +1,8 @@ +python-Levenshtein +PyYAML==5.4 + +setuptools~=45.2.0 +msgpack~=1.0.2 +zstandard~=0.15.2 +mysql-connector-python==8.0.26 +pydantic==1.8.2 diff --git a/.gitignore b/components/core/.gitignore similarity index 100% rename from .gitignore rename to components/core/.gitignore diff --git a/CHANGELOG.md b/components/core/CHANGELOG.md similarity index 100% rename from CHANGELOG.md rename to components/core/CHANGELOG.md diff --git a/CMakeLists.txt b/components/core/CMakeLists.txt similarity index 100% rename from CMakeLists.txt rename to components/core/CMakeLists.txt diff --git a/LICENSE.md b/components/core/LICENSE.md similarity index 100% rename from LICENSE.md rename to components/core/LICENSE.md diff --git a/components/core/README.md b/components/core/README.md new file mode 100644 index 000000000..296e61601 --- /dev/null +++ b/components/core/README.md @@ -0,0 +1,172 @@ +# CLP Core + +CLP's core is the low-level component that performs compression, decompression, and search. + +## Contents + +* [Getting Started](#getting-started) +* [Requirements](#requirements) +* [Building](#building) + * [Source Dependencies](#source-dependencies) + * [Packages](#packages) + * [Libraries](#libraries) + * [Build](#build) +* [Running](#running) + * [`clp`](#clp) + * [`clg`](#clg) +* [Parallel Compression](#parallel-compression) +* [Next Steps](#next-steps) + +## Requirements + +* We have built and tested CLP on **Ubuntu 18.04 (bionic)** and **Ubuntu 20.04 (focal)**. + * If you have trouble building for another OS, file an issue and we may be able to help. +* A compiler that supports c++14 + +## Building + +* To build, we require some source dependencies, packages from package managers, and libraries built from source. + +### Source Dependencies + +We use both git submodules and third-party source packages. 
To download all, you can run this script:
+```shell
+tools/scripts/deps-download/download-all.sh
+```
+
+This will download:
+* [Catch2](https://github.com/catchorg/Catch2.git) (v2.13.6)
+* [date](https://github.com/HowardHinnant/date.git) (v3.0.1)
+* [json](https://github.com/nlohmann/json.git) (v3.10.2)
+* [SQLite3](https://www.sqlite.org/download.html) (v3.36.0)
+* [yaml-cpp](https://github.com/jbeder/yaml-cpp.git) (v0.7.0)
+
+### Packages
+
+If you're using apt-get, you can use the following command to install all of them:
+```shell
+sudo apt-get install -y ca-certificates checkinstall cmake build-essential \
+libboost-filesystem-dev libboost-iostreams-dev libboost-program-options-dev \
+libssl-dev pkg-config rsync wget zlib1g-dev
+```
+
+This will install:
+* ca-certificates
+* checkinstall
+* cmake
+* build-essential
+* libboost-filesystem-dev
+* libboost-iostreams-dev
+* libboost-program-options-dev
+* libssl-dev
+* pkg-config
+* rsync
+* wget
+* zlib1g-dev
+
+### Libraries
+
+The latest versions of some packages are not offered by apt repositories,
+so we've included some scripts to download, compile, and install them:
+```shell
+./tools/scripts/lib_install/fmtlib.sh 8.0.1
+./tools/scripts/lib_install/libarchive.sh 3.5.1
+./tools/scripts/lib_install/lz4.sh 1.8.2
+./tools/scripts/lib_install/mariadb-connector-c.sh 3.2.3
+./tools/scripts/lib_install/spdlog.sh 1.9.2
+./tools/scripts/lib_install/zstandard.sh 1.4.9
+```
+
+### Build
+
+* Configure the CMake project:
+  ```shell
+  mkdir build
+  cd build
+  cmake ../
+  ```
+
+* Build:
+  ```shell
+  make
+  ```
+
+## Running
+
+* CLP contains two executables: `clp` and `clg`
+  * `clp` is used for compressing and extracting logs
+  * `clg` is used for performing wildcard searches on the compressed logs
+
+### `clp`
+
+To compress some logs:
+```shell
+./clp c archives-dir /home/my/logs
+```
+* `archives-dir` is where compressed logs should be output
+  * `clp` will create a number of files and directories within, so it's best if this directory is empty
+  * You can use the same directory repeatedly and `clp` will add to the compressed logs within.
+* `/home/my/logs` is any log file or directory containing log files
+
+To decompress those logs:
+```shell
+./clp x archives-dir decompressed
+```
+* `archives-dir` is where the compressed logs were previously stored
+* `decompressed` is a directory where they will be decompressed to
+
+You can also decompress a specific file:
+```shell
+./clp x archives-dir decompressed /my/file/path.log
+```
+* `/my/file/path.log` is the uncompressed file's path (the one that was passed to `clp` for compression)
+
+More usage instructions can be found by running:
+```shell
+./clp --help
+```
+
+### `clg`
+
+To search the compressed logs:
+```shell
+./clg archives-dir " a *wildcard* search phrase "
+```
+* `archives-dir` is where the compressed logs were previously stored
+* The search phrase can contain the `*` wildcard which matches 0 or more characters, or the `?` wildcard which matches any single character.
+
+Similar to `clp`, `clg` can search a single file:
+```shell
+./clg archives-dir " a *wildcard* search phrase " /my/file/path.log
+```
+* `/my/file/path.log` is the uncompressed file's path (the one that was passed to `clp` for compression)
+
+More usage instructions can be found by running:
+```shell
+./clg --help
+```
+
+## Parallel Compression
+
+By default, `clp` uses an embedded SQLite database, so each directory containing archives can only
+be accessed by a single `clp` instance.
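To make the single-writer limitation above concrete, here is a small, self-contained sketch using plain Python `sqlite3` (not CLP code); the table name and paths are invented for illustration.

```python
# Illustrative only: two connections to one SQLite file; the second writer is locked out.
import os
import sqlite3
import tempfile

db_path = os.path.join(tempfile.mkdtemp(), "metadata.db")  # stand-in for an archives directory's DB

# First "clp instance": open the database and hold a write transaction.
writer = sqlite3.connect(db_path, timeout=0.5, isolation_level=None)
writer.execute("CREATE TABLE archives (id INTEGER PRIMARY KEY, path TEXT)")
writer.execute("BEGIN IMMEDIATE")  # acquire the write lock, like an in-progress compression
writer.execute("INSERT INTO archives (path) VALUES ('archive-0')")

# Second "clp instance": attempting to write concurrently fails once the timeout expires.
other = sqlite3.connect(db_path, timeout=0.5, isolation_level=None)
try:
    other.execute("BEGIN IMMEDIATE")
except sqlite3.OperationalError as err:
    print(f"second writer blocked: {err}")  # typically "database is locked"

writer.execute("COMMIT")
```

The MariaDB-backed configuration described next lifts this restriction for compression.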
+ +To enable parallel compression to the same archives directory, `clp`/`clg` can be configured to +use a MySQL-type database (MariaDB) as follows: + +* Install and configure MariaDB using the instructions for your platform +* Create a user that has privileges to create databases, create tables, insert records, and delete + records. +* Copy and change `config/metadata-db.yml`, setting the type to `mysql` and uncommenting the MySQL + parameters. +* Install the MariaDB and PyYAML Python packages `pip3 install mariadb PyYAML` + * This is necessary to run the database initialization script. If you prefer, you can run the + SQL statements in `tools/scripts/db/init-db.py` directly. +* Run `tools/scripts/db/init-db.py` with the updated config file. This will initialize the + database CLP requires. +* Run `clp` or `clg` as before, with the addition of the `--db-config-file` option pointing at + the updated config file. +* To compress in parallel, simply run another instance of `clp` concurrently. + +Note that currently, decompression (`clp x`) and search (`clg`) can only be run with a single +instance. We are in the process of open-sourcing parallelizable versions of these as well. diff --git a/cmake/Modules/FindLZ4.cmake b/components/core/cmake/Modules/FindLZ4.cmake similarity index 100% rename from cmake/Modules/FindLZ4.cmake rename to components/core/cmake/Modules/FindLZ4.cmake diff --git a/cmake/Modules/FindLibArchive.cmake b/components/core/cmake/Modules/FindLibArchive.cmake similarity index 100% rename from cmake/Modules/FindLibArchive.cmake rename to components/core/cmake/Modules/FindLibArchive.cmake diff --git a/cmake/Modules/FindLibraryDependencies.cmake b/components/core/cmake/Modules/FindLibraryDependencies.cmake similarity index 100% rename from cmake/Modules/FindLibraryDependencies.cmake rename to components/core/cmake/Modules/FindLibraryDependencies.cmake diff --git a/cmake/Modules/FindMariaDBClient.cmake b/components/core/cmake/Modules/FindMariaDBClient.cmake similarity index 100% rename from cmake/Modules/FindMariaDBClient.cmake rename to components/core/cmake/Modules/FindMariaDBClient.cmake diff --git a/cmake/Modules/FindOpenSSL.cmake b/components/core/cmake/Modules/FindOpenSSL.cmake similarity index 100% rename from cmake/Modules/FindOpenSSL.cmake rename to components/core/cmake/Modules/FindOpenSSL.cmake diff --git a/cmake/Modules/FindZStd.cmake b/components/core/cmake/Modules/FindZStd.cmake similarity index 100% rename from cmake/Modules/FindZStd.cmake rename to components/core/cmake/Modules/FindZStd.cmake diff --git a/cmake/Modules/LibFindMacros.cmake b/components/core/cmake/Modules/LibFindMacros.cmake similarity index 100% rename from cmake/Modules/LibFindMacros.cmake rename to components/core/cmake/Modules/LibFindMacros.cmake diff --git a/config/metadata-db.yml b/components/core/config/metadata-db.yml similarity index 100% rename from config/metadata-db.yml rename to components/core/config/metadata-db.yml diff --git a/src/CommandLineArgumentsBase.cpp b/components/core/src/CommandLineArgumentsBase.cpp similarity index 100% rename from src/CommandLineArgumentsBase.cpp rename to components/core/src/CommandLineArgumentsBase.cpp diff --git a/src/CommandLineArgumentsBase.hpp b/components/core/src/CommandLineArgumentsBase.hpp similarity index 100% rename from src/CommandLineArgumentsBase.hpp rename to components/core/src/CommandLineArgumentsBase.hpp diff --git a/src/Defs.h b/components/core/src/Defs.h similarity index 100% rename from src/Defs.h rename to components/core/src/Defs.h 
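As a companion to the Parallel Compression steps above, the following hypothetical sketch checks that the MySQL-type metadata database is reachable, using the `mariadb` and `PyYAML` packages those steps install; the YAML keys are assumptions and may not match the exact layout of `config/metadata-db.yml`.

```python
# Hypothetical connectivity check for the MySQL-type metadata database; adjust the keys
# below to match your edited copy of config/metadata-db.yml.
import mariadb
import yaml

with open("config/metadata-db.yml") as f:
    cfg = yaml.safe_load(f)

connection = mariadb.connect(
    host=cfg.get("host", "127.0.0.1"),
    port=int(cfg.get("port", 3306)),
    user=cfg.get("username", "clp"),
    password=cfg.get("password", ""),
)
cursor = connection.cursor()
cursor.execute("SELECT VERSION()")
print("Connected to MariaDB server version", cursor.fetchone()[0])
connection.close()
```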
diff --git a/src/DictionaryEntry.cpp b/components/core/src/DictionaryEntry.cpp similarity index 100% rename from src/DictionaryEntry.cpp rename to components/core/src/DictionaryEntry.cpp diff --git a/src/DictionaryEntry.hpp b/components/core/src/DictionaryEntry.hpp similarity index 100% rename from src/DictionaryEntry.hpp rename to components/core/src/DictionaryEntry.hpp diff --git a/src/DictionaryReader.cpp b/components/core/src/DictionaryReader.cpp similarity index 100% rename from src/DictionaryReader.cpp rename to components/core/src/DictionaryReader.cpp diff --git a/src/DictionaryReader.hpp b/components/core/src/DictionaryReader.hpp similarity index 100% rename from src/DictionaryReader.hpp rename to components/core/src/DictionaryReader.hpp diff --git a/src/DictionaryWriter.cpp b/components/core/src/DictionaryWriter.cpp similarity index 100% rename from src/DictionaryWriter.cpp rename to components/core/src/DictionaryWriter.cpp diff --git a/src/DictionaryWriter.hpp b/components/core/src/DictionaryWriter.hpp similarity index 100% rename from src/DictionaryWriter.hpp rename to components/core/src/DictionaryWriter.hpp diff --git a/src/EncodedVariableInterpreter.cpp b/components/core/src/EncodedVariableInterpreter.cpp similarity index 100% rename from src/EncodedVariableInterpreter.cpp rename to components/core/src/EncodedVariableInterpreter.cpp diff --git a/src/EncodedVariableInterpreter.hpp b/components/core/src/EncodedVariableInterpreter.hpp similarity index 100% rename from src/EncodedVariableInterpreter.hpp rename to components/core/src/EncodedVariableInterpreter.hpp diff --git a/src/ErrorCode.hpp b/components/core/src/ErrorCode.hpp similarity index 100% rename from src/ErrorCode.hpp rename to components/core/src/ErrorCode.hpp diff --git a/src/FileReader.cpp b/components/core/src/FileReader.cpp similarity index 100% rename from src/FileReader.cpp rename to components/core/src/FileReader.cpp diff --git a/src/FileReader.hpp b/components/core/src/FileReader.hpp similarity index 100% rename from src/FileReader.hpp rename to components/core/src/FileReader.hpp diff --git a/src/FileWriter.cpp b/components/core/src/FileWriter.cpp similarity index 100% rename from src/FileWriter.cpp rename to components/core/src/FileWriter.cpp diff --git a/src/FileWriter.hpp b/components/core/src/FileWriter.hpp similarity index 100% rename from src/FileWriter.hpp rename to components/core/src/FileWriter.hpp diff --git a/src/GlobalMetadataDB.cpp b/components/core/src/GlobalMetadataDB.cpp similarity index 100% rename from src/GlobalMetadataDB.cpp rename to components/core/src/GlobalMetadataDB.cpp diff --git a/src/GlobalMetadataDB.hpp b/components/core/src/GlobalMetadataDB.hpp similarity index 100% rename from src/GlobalMetadataDB.hpp rename to components/core/src/GlobalMetadataDB.hpp diff --git a/src/GlobalMetadataDBConfig.cpp b/components/core/src/GlobalMetadataDBConfig.cpp similarity index 100% rename from src/GlobalMetadataDBConfig.cpp rename to components/core/src/GlobalMetadataDBConfig.cpp diff --git a/src/GlobalMetadataDBConfig.hpp b/components/core/src/GlobalMetadataDBConfig.hpp similarity index 100% rename from src/GlobalMetadataDBConfig.hpp rename to components/core/src/GlobalMetadataDBConfig.hpp diff --git a/src/GlobalMySQLMetadataDB.cpp b/components/core/src/GlobalMySQLMetadataDB.cpp similarity index 100% rename from src/GlobalMySQLMetadataDB.cpp rename to components/core/src/GlobalMySQLMetadataDB.cpp diff --git a/src/GlobalMySQLMetadataDB.hpp b/components/core/src/GlobalMySQLMetadataDB.hpp 
similarity index 100% rename from src/GlobalMySQLMetadataDB.hpp rename to components/core/src/GlobalMySQLMetadataDB.hpp diff --git a/src/GlobalSQLiteMetadataDB.cpp b/components/core/src/GlobalSQLiteMetadataDB.cpp similarity index 100% rename from src/GlobalSQLiteMetadataDB.cpp rename to components/core/src/GlobalSQLiteMetadataDB.cpp diff --git a/src/GlobalSQLiteMetadataDB.hpp b/components/core/src/GlobalSQLiteMetadataDB.hpp similarity index 100% rename from src/GlobalSQLiteMetadataDB.hpp rename to components/core/src/GlobalSQLiteMetadataDB.hpp diff --git a/src/Grep.cpp b/components/core/src/Grep.cpp similarity index 100% rename from src/Grep.cpp rename to components/core/src/Grep.cpp diff --git a/src/Grep.hpp b/components/core/src/Grep.hpp similarity index 100% rename from src/Grep.hpp rename to components/core/src/Grep.hpp diff --git a/src/LibarchiveFileReader.cpp b/components/core/src/LibarchiveFileReader.cpp similarity index 100% rename from src/LibarchiveFileReader.cpp rename to components/core/src/LibarchiveFileReader.cpp diff --git a/src/LibarchiveFileReader.hpp b/components/core/src/LibarchiveFileReader.hpp similarity index 100% rename from src/LibarchiveFileReader.hpp rename to components/core/src/LibarchiveFileReader.hpp diff --git a/src/LibarchiveReader.cpp b/components/core/src/LibarchiveReader.cpp similarity index 100% rename from src/LibarchiveReader.cpp rename to components/core/src/LibarchiveReader.cpp diff --git a/src/LibarchiveReader.hpp b/components/core/src/LibarchiveReader.hpp similarity index 100% rename from src/LibarchiveReader.hpp rename to components/core/src/LibarchiveReader.hpp diff --git a/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp similarity index 100% rename from src/LogTypeDictionaryEntry.cpp rename to components/core/src/LogTypeDictionaryEntry.cpp diff --git a/src/LogTypeDictionaryEntry.hpp b/components/core/src/LogTypeDictionaryEntry.hpp similarity index 100% rename from src/LogTypeDictionaryEntry.hpp rename to components/core/src/LogTypeDictionaryEntry.hpp diff --git a/src/LogTypeDictionaryReader.cpp b/components/core/src/LogTypeDictionaryReader.cpp similarity index 100% rename from src/LogTypeDictionaryReader.cpp rename to components/core/src/LogTypeDictionaryReader.cpp diff --git a/src/LogTypeDictionaryReader.hpp b/components/core/src/LogTypeDictionaryReader.hpp similarity index 100% rename from src/LogTypeDictionaryReader.hpp rename to components/core/src/LogTypeDictionaryReader.hpp diff --git a/src/LogTypeDictionaryWriter.cpp b/components/core/src/LogTypeDictionaryWriter.cpp similarity index 100% rename from src/LogTypeDictionaryWriter.cpp rename to components/core/src/LogTypeDictionaryWriter.cpp diff --git a/src/LogTypeDictionaryWriter.hpp b/components/core/src/LogTypeDictionaryWriter.hpp similarity index 100% rename from src/LogTypeDictionaryWriter.hpp rename to components/core/src/LogTypeDictionaryWriter.hpp diff --git a/src/MessageParser.cpp b/components/core/src/MessageParser.cpp similarity index 100% rename from src/MessageParser.cpp rename to components/core/src/MessageParser.cpp diff --git a/src/MessageParser.hpp b/components/core/src/MessageParser.hpp similarity index 100% rename from src/MessageParser.hpp rename to components/core/src/MessageParser.hpp diff --git a/src/MySQLDB.cpp b/components/core/src/MySQLDB.cpp similarity index 100% rename from src/MySQLDB.cpp rename to components/core/src/MySQLDB.cpp diff --git a/src/MySQLDB.hpp b/components/core/src/MySQLDB.hpp similarity index 100% rename from 
src/MySQLDB.hpp rename to components/core/src/MySQLDB.hpp diff --git a/src/MySQLParamBindings.cpp b/components/core/src/MySQLParamBindings.cpp similarity index 100% rename from src/MySQLParamBindings.cpp rename to components/core/src/MySQLParamBindings.cpp diff --git a/src/MySQLParamBindings.hpp b/components/core/src/MySQLParamBindings.hpp similarity index 100% rename from src/MySQLParamBindings.hpp rename to components/core/src/MySQLParamBindings.hpp diff --git a/src/MySQLPreparedStatement.cpp b/components/core/src/MySQLPreparedStatement.cpp similarity index 100% rename from src/MySQLPreparedStatement.cpp rename to components/core/src/MySQLPreparedStatement.cpp diff --git a/src/MySQLPreparedStatement.hpp b/components/core/src/MySQLPreparedStatement.hpp similarity index 100% rename from src/MySQLPreparedStatement.hpp rename to components/core/src/MySQLPreparedStatement.hpp diff --git a/src/PageAllocatedVector.cpp b/components/core/src/PageAllocatedVector.cpp similarity index 100% rename from src/PageAllocatedVector.cpp rename to components/core/src/PageAllocatedVector.cpp diff --git a/src/PageAllocatedVector.hpp b/components/core/src/PageAllocatedVector.hpp similarity index 100% rename from src/PageAllocatedVector.hpp rename to components/core/src/PageAllocatedVector.hpp diff --git a/src/ParsedMessage.cpp b/components/core/src/ParsedMessage.cpp similarity index 100% rename from src/ParsedMessage.cpp rename to components/core/src/ParsedMessage.cpp diff --git a/src/ParsedMessage.hpp b/components/core/src/ParsedMessage.hpp similarity index 100% rename from src/ParsedMessage.hpp rename to components/core/src/ParsedMessage.hpp diff --git a/src/Profiler.cpp b/components/core/src/Profiler.cpp similarity index 100% rename from src/Profiler.cpp rename to components/core/src/Profiler.cpp diff --git a/src/Profiler.hpp b/components/core/src/Profiler.hpp similarity index 100% rename from src/Profiler.hpp rename to components/core/src/Profiler.hpp diff --git a/src/Query.cpp b/components/core/src/Query.cpp similarity index 100% rename from src/Query.cpp rename to components/core/src/Query.cpp diff --git a/src/Query.hpp b/components/core/src/Query.hpp similarity index 100% rename from src/Query.hpp rename to components/core/src/Query.hpp diff --git a/src/ReaderInterface.cpp b/components/core/src/ReaderInterface.cpp similarity index 100% rename from src/ReaderInterface.cpp rename to components/core/src/ReaderInterface.cpp diff --git a/src/ReaderInterface.hpp b/components/core/src/ReaderInterface.hpp similarity index 100% rename from src/ReaderInterface.hpp rename to components/core/src/ReaderInterface.hpp diff --git a/src/SQLiteDB.cpp b/components/core/src/SQLiteDB.cpp similarity index 100% rename from src/SQLiteDB.cpp rename to components/core/src/SQLiteDB.cpp diff --git a/src/SQLiteDB.hpp b/components/core/src/SQLiteDB.hpp similarity index 100% rename from src/SQLiteDB.hpp rename to components/core/src/SQLiteDB.hpp diff --git a/src/SQLitePreparedStatement.cpp b/components/core/src/SQLitePreparedStatement.cpp similarity index 100% rename from src/SQLitePreparedStatement.cpp rename to components/core/src/SQLitePreparedStatement.cpp diff --git a/src/SQLitePreparedStatement.hpp b/components/core/src/SQLitePreparedStatement.hpp similarity index 100% rename from src/SQLitePreparedStatement.hpp rename to components/core/src/SQLitePreparedStatement.hpp diff --git a/src/Stopwatch.cpp b/components/core/src/Stopwatch.cpp similarity index 100% rename from src/Stopwatch.cpp rename to components/core/src/Stopwatch.cpp 
diff --git a/src/Stopwatch.hpp b/components/core/src/Stopwatch.hpp similarity index 100% rename from src/Stopwatch.hpp rename to components/core/src/Stopwatch.hpp diff --git a/src/TimestampPattern.cpp b/components/core/src/TimestampPattern.cpp similarity index 100% rename from src/TimestampPattern.cpp rename to components/core/src/TimestampPattern.cpp diff --git a/src/TimestampPattern.hpp b/components/core/src/TimestampPattern.hpp similarity index 100% rename from src/TimestampPattern.hpp rename to components/core/src/TimestampPattern.hpp diff --git a/src/TraceableException.cpp b/components/core/src/TraceableException.cpp similarity index 100% rename from src/TraceableException.cpp rename to components/core/src/TraceableException.cpp diff --git a/src/TraceableException.hpp b/components/core/src/TraceableException.hpp similarity index 100% rename from src/TraceableException.hpp rename to components/core/src/TraceableException.hpp diff --git a/src/Utils.cpp b/components/core/src/Utils.cpp similarity index 100% rename from src/Utils.cpp rename to components/core/src/Utils.cpp diff --git a/src/Utils.hpp b/components/core/src/Utils.hpp similarity index 100% rename from src/Utils.hpp rename to components/core/src/Utils.hpp diff --git a/src/VariableDictionaryEntry.cpp b/components/core/src/VariableDictionaryEntry.cpp similarity index 100% rename from src/VariableDictionaryEntry.cpp rename to components/core/src/VariableDictionaryEntry.cpp diff --git a/src/VariableDictionaryEntry.hpp b/components/core/src/VariableDictionaryEntry.hpp similarity index 100% rename from src/VariableDictionaryEntry.hpp rename to components/core/src/VariableDictionaryEntry.hpp diff --git a/src/VariableDictionaryReader.cpp b/components/core/src/VariableDictionaryReader.cpp similarity index 100% rename from src/VariableDictionaryReader.cpp rename to components/core/src/VariableDictionaryReader.cpp diff --git a/src/VariableDictionaryReader.hpp b/components/core/src/VariableDictionaryReader.hpp similarity index 100% rename from src/VariableDictionaryReader.hpp rename to components/core/src/VariableDictionaryReader.hpp diff --git a/src/VariableDictionaryWriter.cpp b/components/core/src/VariableDictionaryWriter.cpp similarity index 100% rename from src/VariableDictionaryWriter.cpp rename to components/core/src/VariableDictionaryWriter.cpp diff --git a/src/VariableDictionaryWriter.hpp b/components/core/src/VariableDictionaryWriter.hpp similarity index 100% rename from src/VariableDictionaryWriter.hpp rename to components/core/src/VariableDictionaryWriter.hpp diff --git a/src/Writer.cpp b/components/core/src/Writer.cpp similarity index 100% rename from src/Writer.cpp rename to components/core/src/Writer.cpp diff --git a/src/Writer.hpp b/components/core/src/Writer.hpp similarity index 100% rename from src/Writer.hpp rename to components/core/src/Writer.hpp diff --git a/src/WriterInterface.cpp b/components/core/src/WriterInterface.cpp similarity index 100% rename from src/WriterInterface.cpp rename to components/core/src/WriterInterface.cpp diff --git a/src/WriterInterface.hpp b/components/core/src/WriterInterface.hpp similarity index 100% rename from src/WriterInterface.hpp rename to components/core/src/WriterInterface.hpp diff --git a/src/clg/CommandLineArguments.cpp b/components/core/src/clg/CommandLineArguments.cpp similarity index 100% rename from src/clg/CommandLineArguments.cpp rename to components/core/src/clg/CommandLineArguments.cpp diff --git a/src/clg/CommandLineArguments.hpp 
b/components/core/src/clg/CommandLineArguments.hpp similarity index 100% rename from src/clg/CommandLineArguments.hpp rename to components/core/src/clg/CommandLineArguments.hpp diff --git a/src/clg/clg.cpp b/components/core/src/clg/clg.cpp similarity index 100% rename from src/clg/clg.cpp rename to components/core/src/clg/clg.cpp diff --git a/src/clp/CommandLineArguments.cpp b/components/core/src/clp/CommandLineArguments.cpp similarity index 100% rename from src/clp/CommandLineArguments.cpp rename to components/core/src/clp/CommandLineArguments.cpp diff --git a/src/clp/CommandLineArguments.hpp b/components/core/src/clp/CommandLineArguments.hpp similarity index 100% rename from src/clp/CommandLineArguments.hpp rename to components/core/src/clp/CommandLineArguments.hpp diff --git a/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp similarity index 100% rename from src/clp/FileCompressor.cpp rename to components/core/src/clp/FileCompressor.cpp diff --git a/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp similarity index 100% rename from src/clp/FileCompressor.hpp rename to components/core/src/clp/FileCompressor.hpp diff --git a/src/clp/FileDecompressor.cpp b/components/core/src/clp/FileDecompressor.cpp similarity index 100% rename from src/clp/FileDecompressor.cpp rename to components/core/src/clp/FileDecompressor.cpp diff --git a/src/clp/FileDecompressor.hpp b/components/core/src/clp/FileDecompressor.hpp similarity index 100% rename from src/clp/FileDecompressor.hpp rename to components/core/src/clp/FileDecompressor.hpp diff --git a/src/clp/FileToCompress.cpp b/components/core/src/clp/FileToCompress.cpp similarity index 100% rename from src/clp/FileToCompress.cpp rename to components/core/src/clp/FileToCompress.cpp diff --git a/src/clp/FileToCompress.hpp b/components/core/src/clp/FileToCompress.hpp similarity index 100% rename from src/clp/FileToCompress.hpp rename to components/core/src/clp/FileToCompress.hpp diff --git a/src/clp/StructuredFileToCompress.cpp b/components/core/src/clp/StructuredFileToCompress.cpp similarity index 100% rename from src/clp/StructuredFileToCompress.cpp rename to components/core/src/clp/StructuredFileToCompress.cpp diff --git a/src/clp/StructuredFileToCompress.hpp b/components/core/src/clp/StructuredFileToCompress.hpp similarity index 100% rename from src/clp/StructuredFileToCompress.hpp rename to components/core/src/clp/StructuredFileToCompress.hpp diff --git a/src/clp/clp.cpp b/components/core/src/clp/clp.cpp similarity index 100% rename from src/clp/clp.cpp rename to components/core/src/clp/clp.cpp diff --git a/src/clp/compression.cpp b/components/core/src/clp/compression.cpp similarity index 100% rename from src/clp/compression.cpp rename to components/core/src/clp/compression.cpp diff --git a/src/clp/compression.hpp b/components/core/src/clp/compression.hpp similarity index 100% rename from src/clp/compression.hpp rename to components/core/src/clp/compression.hpp diff --git a/src/clp/decompression.cpp b/components/core/src/clp/decompression.cpp similarity index 100% rename from src/clp/decompression.cpp rename to components/core/src/clp/decompression.cpp diff --git a/src/clp/decompression.hpp b/components/core/src/clp/decompression.hpp similarity index 100% rename from src/clp/decompression.hpp rename to components/core/src/clp/decompression.hpp diff --git a/src/clp/utils.cpp b/components/core/src/clp/utils.cpp similarity index 100% rename from src/clp/utils.cpp rename to components/core/src/clp/utils.cpp diff 
--git a/src/clp/utils.hpp b/components/core/src/clp/utils.hpp similarity index 100% rename from src/clp/utils.hpp rename to components/core/src/clp/utils.hpp diff --git a/src/database_utils.cpp b/components/core/src/database_utils.cpp similarity index 100% rename from src/database_utils.cpp rename to components/core/src/database_utils.cpp diff --git a/src/database_utils.hpp b/components/core/src/database_utils.hpp similarity index 100% rename from src/database_utils.hpp rename to components/core/src/database_utils.hpp diff --git a/src/dictionary_utils.cpp b/components/core/src/dictionary_utils.cpp similarity index 100% rename from src/dictionary_utils.cpp rename to components/core/src/dictionary_utils.cpp diff --git a/src/dictionary_utils.hpp b/components/core/src/dictionary_utils.hpp similarity index 100% rename from src/dictionary_utils.hpp rename to components/core/src/dictionary_utils.hpp diff --git a/src/streaming_archive/Constants.hpp b/components/core/src/streaming_archive/Constants.hpp similarity index 100% rename from src/streaming_archive/Constants.hpp rename to components/core/src/streaming_archive/Constants.hpp diff --git a/src/streaming_archive/MetadataDB.cpp b/components/core/src/streaming_archive/MetadataDB.cpp similarity index 100% rename from src/streaming_archive/MetadataDB.cpp rename to components/core/src/streaming_archive/MetadataDB.cpp diff --git a/src/streaming_archive/MetadataDB.hpp b/components/core/src/streaming_archive/MetadataDB.hpp similarity index 100% rename from src/streaming_archive/MetadataDB.hpp rename to components/core/src/streaming_archive/MetadataDB.hpp diff --git a/src/streaming_archive/reader/Archive.cpp b/components/core/src/streaming_archive/reader/Archive.cpp similarity index 100% rename from src/streaming_archive/reader/Archive.cpp rename to components/core/src/streaming_archive/reader/Archive.cpp diff --git a/src/streaming_archive/reader/Archive.hpp b/components/core/src/streaming_archive/reader/Archive.hpp similarity index 100% rename from src/streaming_archive/reader/Archive.hpp rename to components/core/src/streaming_archive/reader/Archive.hpp diff --git a/src/streaming_archive/reader/File.cpp b/components/core/src/streaming_archive/reader/File.cpp similarity index 100% rename from src/streaming_archive/reader/File.cpp rename to components/core/src/streaming_archive/reader/File.cpp diff --git a/src/streaming_archive/reader/File.hpp b/components/core/src/streaming_archive/reader/File.hpp similarity index 100% rename from src/streaming_archive/reader/File.hpp rename to components/core/src/streaming_archive/reader/File.hpp diff --git a/src/streaming_archive/reader/Message.cpp b/components/core/src/streaming_archive/reader/Message.cpp similarity index 100% rename from src/streaming_archive/reader/Message.cpp rename to components/core/src/streaming_archive/reader/Message.cpp diff --git a/src/streaming_archive/reader/Message.hpp b/components/core/src/streaming_archive/reader/Message.hpp similarity index 100% rename from src/streaming_archive/reader/Message.hpp rename to components/core/src/streaming_archive/reader/Message.hpp diff --git a/src/streaming_archive/reader/Segment.cpp b/components/core/src/streaming_archive/reader/Segment.cpp similarity index 100% rename from src/streaming_archive/reader/Segment.cpp rename to components/core/src/streaming_archive/reader/Segment.cpp diff --git a/src/streaming_archive/reader/Segment.hpp b/components/core/src/streaming_archive/reader/Segment.hpp similarity index 100% rename from 
src/streaming_archive/reader/Segment.hpp rename to components/core/src/streaming_archive/reader/Segment.hpp diff --git a/src/streaming_archive/reader/SegmentManager.cpp b/components/core/src/streaming_archive/reader/SegmentManager.cpp similarity index 100% rename from src/streaming_archive/reader/SegmentManager.cpp rename to components/core/src/streaming_archive/reader/SegmentManager.cpp diff --git a/src/streaming_archive/reader/SegmentManager.hpp b/components/core/src/streaming_archive/reader/SegmentManager.hpp similarity index 100% rename from src/streaming_archive/reader/SegmentManager.hpp rename to components/core/src/streaming_archive/reader/SegmentManager.hpp diff --git a/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp similarity index 100% rename from src/streaming_archive/writer/Archive.cpp rename to components/core/src/streaming_archive/writer/Archive.cpp diff --git a/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp similarity index 100% rename from src/streaming_archive/writer/Archive.hpp rename to components/core/src/streaming_archive/writer/Archive.hpp diff --git a/src/streaming_archive/writer/File.cpp b/components/core/src/streaming_archive/writer/File.cpp similarity index 100% rename from src/streaming_archive/writer/File.cpp rename to components/core/src/streaming_archive/writer/File.cpp diff --git a/src/streaming_archive/writer/File.hpp b/components/core/src/streaming_archive/writer/File.hpp similarity index 100% rename from src/streaming_archive/writer/File.hpp rename to components/core/src/streaming_archive/writer/File.hpp diff --git a/src/streaming_archive/writer/InMemoryFile.cpp b/components/core/src/streaming_archive/writer/InMemoryFile.cpp similarity index 100% rename from src/streaming_archive/writer/InMemoryFile.cpp rename to components/core/src/streaming_archive/writer/InMemoryFile.cpp diff --git a/src/streaming_archive/writer/InMemoryFile.hpp b/components/core/src/streaming_archive/writer/InMemoryFile.hpp similarity index 100% rename from src/streaming_archive/writer/InMemoryFile.hpp rename to components/core/src/streaming_archive/writer/InMemoryFile.hpp diff --git a/src/streaming_archive/writer/OnDiskFile.cpp b/components/core/src/streaming_archive/writer/OnDiskFile.cpp similarity index 100% rename from src/streaming_archive/writer/OnDiskFile.cpp rename to components/core/src/streaming_archive/writer/OnDiskFile.cpp diff --git a/src/streaming_archive/writer/OnDiskFile.hpp b/components/core/src/streaming_archive/writer/OnDiskFile.hpp similarity index 100% rename from src/streaming_archive/writer/OnDiskFile.hpp rename to components/core/src/streaming_archive/writer/OnDiskFile.hpp diff --git a/src/streaming_archive/writer/Segment.cpp b/components/core/src/streaming_archive/writer/Segment.cpp similarity index 100% rename from src/streaming_archive/writer/Segment.cpp rename to components/core/src/streaming_archive/writer/Segment.cpp diff --git a/src/streaming_archive/writer/Segment.hpp b/components/core/src/streaming_archive/writer/Segment.hpp similarity index 100% rename from src/streaming_archive/writer/Segment.hpp rename to components/core/src/streaming_archive/writer/Segment.hpp diff --git a/src/streaming_compression/Compressor.cpp b/components/core/src/streaming_compression/Compressor.cpp similarity index 100% rename from src/streaming_compression/Compressor.cpp rename to components/core/src/streaming_compression/Compressor.cpp diff --git 
a/src/streaming_compression/Compressor.hpp b/components/core/src/streaming_compression/Compressor.hpp similarity index 100% rename from src/streaming_compression/Compressor.hpp rename to components/core/src/streaming_compression/Compressor.hpp diff --git a/src/streaming_compression/Constants.hpp b/components/core/src/streaming_compression/Constants.hpp similarity index 100% rename from src/streaming_compression/Constants.hpp rename to components/core/src/streaming_compression/Constants.hpp diff --git a/src/streaming_compression/Decompressor.cpp b/components/core/src/streaming_compression/Decompressor.cpp similarity index 100% rename from src/streaming_compression/Decompressor.cpp rename to components/core/src/streaming_compression/Decompressor.cpp diff --git a/src/streaming_compression/Decompressor.hpp b/components/core/src/streaming_compression/Decompressor.hpp similarity index 100% rename from src/streaming_compression/Decompressor.hpp rename to components/core/src/streaming_compression/Decompressor.hpp diff --git a/src/streaming_compression/passthrough/Compressor.cpp b/components/core/src/streaming_compression/passthrough/Compressor.cpp similarity index 100% rename from src/streaming_compression/passthrough/Compressor.cpp rename to components/core/src/streaming_compression/passthrough/Compressor.cpp diff --git a/src/streaming_compression/passthrough/Compressor.hpp b/components/core/src/streaming_compression/passthrough/Compressor.hpp similarity index 100% rename from src/streaming_compression/passthrough/Compressor.hpp rename to components/core/src/streaming_compression/passthrough/Compressor.hpp diff --git a/src/streaming_compression/passthrough/Decompressor.cpp b/components/core/src/streaming_compression/passthrough/Decompressor.cpp similarity index 100% rename from src/streaming_compression/passthrough/Decompressor.cpp rename to components/core/src/streaming_compression/passthrough/Decompressor.cpp diff --git a/src/streaming_compression/passthrough/Decompressor.hpp b/components/core/src/streaming_compression/passthrough/Decompressor.hpp similarity index 100% rename from src/streaming_compression/passthrough/Decompressor.hpp rename to components/core/src/streaming_compression/passthrough/Decompressor.hpp diff --git a/src/streaming_compression/zstd/Compressor.cpp b/components/core/src/streaming_compression/zstd/Compressor.cpp similarity index 100% rename from src/streaming_compression/zstd/Compressor.cpp rename to components/core/src/streaming_compression/zstd/Compressor.cpp diff --git a/src/streaming_compression/zstd/Compressor.hpp b/components/core/src/streaming_compression/zstd/Compressor.hpp similarity index 100% rename from src/streaming_compression/zstd/Compressor.hpp rename to components/core/src/streaming_compression/zstd/Compressor.hpp diff --git a/src/streaming_compression/zstd/Constants.hpp b/components/core/src/streaming_compression/zstd/Constants.hpp similarity index 100% rename from src/streaming_compression/zstd/Constants.hpp rename to components/core/src/streaming_compression/zstd/Constants.hpp diff --git a/src/streaming_compression/zstd/Decompressor.cpp b/components/core/src/streaming_compression/zstd/Decompressor.cpp similarity index 100% rename from src/streaming_compression/zstd/Decompressor.cpp rename to components/core/src/streaming_compression/zstd/Decompressor.cpp diff --git a/src/streaming_compression/zstd/Decompressor.hpp b/components/core/src/streaming_compression/zstd/Decompressor.hpp similarity index 100% rename from 
src/streaming_compression/zstd/Decompressor.hpp rename to components/core/src/streaming_compression/zstd/Decompressor.hpp diff --git a/src/version.hpp b/components/core/src/version.hpp similarity index 100% rename from src/version.hpp rename to components/core/src/version.hpp diff --git a/submodules/Catch2 b/components/core/submodules/Catch2 similarity index 100% rename from submodules/Catch2 rename to components/core/submodules/Catch2 diff --git a/submodules/date b/components/core/submodules/date similarity index 100% rename from submodules/date rename to components/core/submodules/date diff --git a/submodules/json b/components/core/submodules/json similarity index 100% rename from submodules/json rename to components/core/submodules/json diff --git a/submodules/yaml-cpp b/components/core/submodules/yaml-cpp similarity index 100% rename from submodules/yaml-cpp rename to components/core/submodules/yaml-cpp diff --git a/tests/test-EncodedVariableInterpreter.cpp b/components/core/tests/test-EncodedVariableInterpreter.cpp similarity index 100% rename from tests/test-EncodedVariableInterpreter.cpp rename to components/core/tests/test-EncodedVariableInterpreter.cpp diff --git a/tests/test-Grep.cpp b/components/core/tests/test-Grep.cpp similarity index 100% rename from tests/test-Grep.cpp rename to components/core/tests/test-Grep.cpp diff --git a/tests/test-Segment.cpp b/components/core/tests/test-Segment.cpp similarity index 100% rename from tests/test-Segment.cpp rename to components/core/tests/test-Segment.cpp diff --git a/tests/test-Stopwatch.cpp b/components/core/tests/test-Stopwatch.cpp similarity index 100% rename from tests/test-Stopwatch.cpp rename to components/core/tests/test-Stopwatch.cpp diff --git a/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp similarity index 100% rename from tests/test-StreamingCompression.cpp rename to components/core/tests/test-StreamingCompression.cpp diff --git a/tests/test-TimestampPattern.cpp b/components/core/tests/test-TimestampPattern.cpp similarity index 100% rename from tests/test-TimestampPattern.cpp rename to components/core/tests/test-TimestampPattern.cpp diff --git a/tests/test-Utils.cpp b/components/core/tests/test-Utils.cpp similarity index 100% rename from tests/test-Utils.cpp rename to components/core/tests/test-Utils.cpp diff --git a/tests/test-main.cpp b/components/core/tests/test-main.cpp similarity index 100% rename from tests/test-main.cpp rename to components/core/tests/test-main.cpp diff --git a/tools/docker-images/clp-env-base-bionic/Dockerfile b/components/core/tools/docker-images/clp-env-base-bionic/Dockerfile similarity index 100% rename from tools/docker-images/clp-env-base-bionic/Dockerfile rename to components/core/tools/docker-images/clp-env-base-bionic/Dockerfile diff --git a/tools/docker-images/clp-env-base-bionic/build.sh b/components/core/tools/docker-images/clp-env-base-bionic/build.sh similarity index 100% rename from tools/docker-images/clp-env-base-bionic/build.sh rename to components/core/tools/docker-images/clp-env-base-bionic/build.sh diff --git a/tools/docker-images/clp-env-base-centos7.4/Dockerfile b/components/core/tools/docker-images/clp-env-base-centos7.4/Dockerfile similarity index 100% rename from tools/docker-images/clp-env-base-centos7.4/Dockerfile rename to components/core/tools/docker-images/clp-env-base-centos7.4/Dockerfile diff --git a/tools/docker-images/clp-env-base-centos7.4/build.sh b/components/core/tools/docker-images/clp-env-base-centos7.4/build.sh 
similarity index 100% rename from tools/docker-images/clp-env-base-centos7.4/build.sh rename to components/core/tools/docker-images/clp-env-base-centos7.4/build.sh diff --git a/tools/docker-images/clp-env-base-centos7.4/install-boost.sh b/components/core/tools/docker-images/clp-env-base-centos7.4/install-boost.sh similarity index 100% rename from tools/docker-images/clp-env-base-centos7.4/install-boost.sh rename to components/core/tools/docker-images/clp-env-base-centos7.4/install-boost.sh diff --git a/tools/docker-images/clp-env-base-centos7.4/install-cmake.sh b/components/core/tools/docker-images/clp-env-base-centos7.4/install-cmake.sh similarity index 100% rename from tools/docker-images/clp-env-base-centos7.4/install-cmake.sh rename to components/core/tools/docker-images/clp-env-base-centos7.4/install-cmake.sh diff --git a/tools/docker-images/clp-env-base-centos7.4/install-gcc.sh b/components/core/tools/docker-images/clp-env-base-centos7.4/install-gcc.sh similarity index 100% rename from tools/docker-images/clp-env-base-centos7.4/install-gcc.sh rename to components/core/tools/docker-images/clp-env-base-centos7.4/install-gcc.sh diff --git a/tools/docker-images/clp-env-base-focal/Dockerfile b/components/core/tools/docker-images/clp-env-base-focal/Dockerfile similarity index 100% rename from tools/docker-images/clp-env-base-focal/Dockerfile rename to components/core/tools/docker-images/clp-env-base-focal/Dockerfile diff --git a/tools/docker-images/clp-env-base-focal/build.sh b/components/core/tools/docker-images/clp-env-base-focal/build.sh similarity index 100% rename from tools/docker-images/clp-env-base-focal/build.sh rename to components/core/tools/docker-images/clp-env-base-focal/build.sh diff --git a/tools/scripts/db/init-db.py b/components/core/tools/scripts/db/init-db.py similarity index 100% rename from tools/scripts/db/init-db.py rename to components/core/tools/scripts/db/init-db.py diff --git a/tools/scripts/deps-download/download-all.sh b/components/core/tools/scripts/deps-download/download-all.sh similarity index 82% rename from tools/scripts/deps-download/download-all.sh rename to components/core/tools/scripts/deps-download/download-all.sh index 1b404f819..67dc1d895 100755 --- a/tools/scripts/deps-download/download-all.sh +++ b/components/core/tools/scripts/deps-download/download-all.sh @@ -1,7 +1,7 @@ #!/bin/bash script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -project_root_dir=${script_dir}/../../../ +project_root_dir=${script_dir}/../../../../../ cd ${project_root_dir} git submodule update --init --recursive diff --git a/tools/scripts/deps-download/download-dep.py b/components/core/tools/scripts/deps-download/download-dep.py similarity index 100% rename from tools/scripts/deps-download/download-dep.py rename to components/core/tools/scripts/deps-download/download-dep.py diff --git a/tools/scripts/deps-download/sqlite3.json b/components/core/tools/scripts/deps-download/sqlite3.json similarity index 100% rename from tools/scripts/deps-download/sqlite3.json rename to components/core/tools/scripts/deps-download/sqlite3.json diff --git a/tools/scripts/lib_install/fmtlib.sh b/components/core/tools/scripts/lib_install/fmtlib.sh similarity index 100% rename from tools/scripts/lib_install/fmtlib.sh rename to components/core/tools/scripts/lib_install/fmtlib.sh diff --git a/tools/scripts/lib_install/libarchive.sh b/components/core/tools/scripts/lib_install/libarchive.sh similarity index 100% rename from tools/scripts/lib_install/libarchive.sh rename to 
components/core/tools/scripts/lib_install/libarchive.sh diff --git a/tools/scripts/lib_install/lz4.sh b/components/core/tools/scripts/lib_install/lz4.sh similarity index 100% rename from tools/scripts/lib_install/lz4.sh rename to components/core/tools/scripts/lib_install/lz4.sh diff --git a/tools/scripts/lib_install/mariadb-connector-c.sh b/components/core/tools/scripts/lib_install/mariadb-connector-c.sh similarity index 100% rename from tools/scripts/lib_install/mariadb-connector-c.sh rename to components/core/tools/scripts/lib_install/mariadb-connector-c.sh diff --git a/tools/scripts/lib_install/spdlog.sh b/components/core/tools/scripts/lib_install/spdlog.sh similarity index 100% rename from tools/scripts/lib_install/spdlog.sh rename to components/core/tools/scripts/lib_install/spdlog.sh diff --git a/tools/scripts/lib_install/zstandard.sh b/components/core/tools/scripts/lib_install/zstandard.sh similarity index 100% rename from tools/scripts/lib_install/zstandard.sh rename to components/core/tools/scripts/lib_install/zstandard.sh diff --git a/components/job-orchestration/LICENSE b/components/job-orchestration/LICENSE new file mode 100644 index 000000000..3340c889f --- /dev/null +++ b/components/job-orchestration/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 YScope Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/components/job-orchestration/README.md b/components/job-orchestration/README.md new file mode 100644 index 000000000..ab24cfbc1 --- /dev/null +++ b/components/job-orchestration/README.md @@ -0,0 +1,33 @@ +# CLP Job Orchestration + +This Python module contains CLP's scheduler and worker to handle distributed compression. +CLP's Compression Job Handler can be used to interface and submit compression jobs to the CLP scheduler. 
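
For illustration only, here is a minimal sketch of how a job handler might enqueue a single compression task, mirroring what the scheduler does internally with `compress.apply_async`. The broker URL and the payload values below are placeholders, not part of this component:

```python
import os

from celery import Celery

# Assumes a reachable RabbitMQ broker; replace the URL for your deployment.
broker_url = os.getenv('BROKER_URL', 'amqp://guest:guest@localhost:5672//')
app = Celery('clp_scheduler', broker=broker_url)

# Placeholder payloads; a real job handler fills these in from the
# compression_jobs / compression_tasks tables in the CLP metadata database.
clp_io_config_json = '{}'          # serialized ClpIoConfig
paths_to_compress_json = '{}'      # serialized PathsToCompress
database_connection_params = {}    # metadata database connection parameters

# The task name and queue match job_orchestration.executor.celeryconfig.
result = app.send_task(
    'job_orchestration.executor.compression.task.compress',
    args=(1, 1, clp_io_config_json, paths_to_compress_json, database_connection_params),
    queue='compression',
)
```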
+ +## Installation + +```bash +pip3 install -r requirements.txt --target /lib/python3/site-packages +cp -R clp_py_utils /lib/python3/site-packages +``` + +## Usage + +### Running the `scheduler` + +```bash +PYTHONPATH= \ + BROKER_URL=amqp://:@: \ + python3 -m job_orchestration.scheduler.scheduler --config +``` + +### Running the `executor` + +```bash +PYTHONPATH= \ + CLP_HOME= \ + CLP_DATA_DIR= \ + CLP_LOGS_DIR= \ + BROKER_URL=amqp://:@: \ + RESULT_BACKEND=rpc://:@: \ + celery -A executor worker --loglevel INFO -Q compression +``` diff --git a/components/job-orchestration/job_orchestration/executor/__init__.py b/components/job-orchestration/job_orchestration/executor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/job-orchestration/job_orchestration/executor/celery.py b/components/job-orchestration/job_orchestration/executor/celery.py new file mode 100644 index 000000000..769b8a2d3 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/celery.py @@ -0,0 +1,9 @@ +from celery import Celery + +from . import celeryconfig + +app = Celery('clp_scheduler') +app.config_from_object(celeryconfig) + +if '__main__' == __name__: + app.start() diff --git a/components/job-orchestration/job_orchestration/executor/celeryconfig.py b/components/job-orchestration/job_orchestration/executor/celeryconfig.py new file mode 100644 index 000000000..768b5f4ec --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/celeryconfig.py @@ -0,0 +1,10 @@ +import os +result_persistent = True +worker_prefetch_multiplier = 1 +task_queue_max_priority = 3 +imports = 'job_orchestration.executor.compression.task' +task_routes = {'job_orchestration.executor.compression.task.compress': 'compression'} +task_create_missing_queues = True + +broker_url = os.getenv('BROKER_URL') +result_backend = os.getenv('RESULT_BACKEND') \ No newline at end of file diff --git a/components/job-orchestration/job_orchestration/executor/compression/__init__.py b/components/job-orchestration/job_orchestration/executor/compression/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/job-orchestration/job_orchestration/executor/compression/fs_to_fs_compress_method.py b/components/job-orchestration/job_orchestration/executor/compression/fs_to_fs_compress_method.py new file mode 100644 index 000000000..00e3d95ee --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/compression/fs_to_fs_compress_method.py @@ -0,0 +1,151 @@ +""" +This module is specifically to hold the remote method, easing the process of +figuring out what imports it requires. 
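The compress function defined here shells out to the `clp` binary under the given CLP home directory and totals the per-archive statistics that the binary prints while compressing.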
+""" +import json +import pathlib +import subprocess +import sys + +import celery.utils.nodenames +import yaml +from celery.utils.log import get_task_logger + +from clp_py_utils.clp_io_config import ClpIoConfig, PathsToCompress + + +def compress(clp_config: ClpIoConfig, clp_home_str: str, data_dir_str: str, logs_dir_str: str, + job_id_str: str, task_id_str: str, paths_to_compress: PathsToCompress, database_connection_params): + """ + Compresses files from an FS into archives on an FS + + :param clp_config: ClpIoConfig + :param clp_home_str: + :param data_dir_str: + :param logs_dir_str: + :param job_id_str: + :param task_id_str: + :param paths_to_compress: PathToCompress + :param database_connection_params: + :return: tuple -- (whether compression was successful, output messages) + """ + # Setup logging + logger = get_task_logger(__name__) + + instance_id_str = f'job-{job_id_str}-task-{task_id_str}' + + clp_home = pathlib.Path(clp_home_str) + + # Add clp package to sys.path + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + logger.error('Failed to load python3 packages bundled with CLP.') + return False, 0 + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + # Expand parameters + path_prefix_to_remove = clp_config.input.path_prefix_to_remove + + file_paths = paths_to_compress.file_paths + + data_dir = pathlib.Path(data_dir_str).resolve() + logs_dir = pathlib.Path(logs_dir_str).resolve() + + # Generate database config file for clp + db_config_file_path = data_dir / f'{instance_id_str}-db-config.yml' + db_config_file = open(db_config_file_path, 'w') + yaml.safe_dump(database_connection_params, db_config_file) + db_config_file.close() + + # Start assembling compression command + archives_dir = data_dir / 'archives' + compression_cmd = [ + str(clp_home / 'bin' / 'clp'), + 'c', str(archives_dir), + '--print-archive-stats-progress', + '--target-dictionaries-size', + str(clp_config.output.target_dictionaries_size), + '--target-segment-size', str(clp_config.output.target_segment_size), + '--target-encoded-file-size', str(clp_config.output.target_encoded_file_size), + '--storage-id', + '--db-config-file', str(db_config_file_path) + ] + if clp_config.output.storage_is_node_specific: + compression_cmd.append(celery.utils.nodenames.gethostname()) + else: + # Mark as globally-accessible + compression_cmd.append('*') + if path_prefix_to_remove: + compression_cmd.append('--remove-path-prefix') + compression_cmd.append(path_prefix_to_remove) + + # Prepare list of paths to compress for clp + log_list_path = data_dir / f'{instance_id_str}-log-paths.txt' + with open(log_list_path, 'w') as file: + if len(file_paths) > 0: + for path_str in file_paths: + file.write(path_str) + file.write('\n') + if paths_to_compress.empty_directories and len(paths_to_compress.empty_directories) > 0: + # Prepare list of paths to compress for clp + for path_str in paths_to_compress.empty_directories: + file.write(path_str) + file.write('\n') + + compression_cmd.append('--files-from') + compression_cmd.append(str(log_list_path)) + + # Open stderr log file + stderr_log_path = logs_dir / f'{instance_id_str}-stderr.log' + stderr_log_file = open(stderr_log_path, 'w') + + # Start compression + logger.debug('Compressing...') + compression_successful = False + proc = subprocess.Popen(compression_cmd, close_fds=True, stdout=subprocess.PIPE, + stderr=stderr_log_file) + + # Compute the total amount of data compressed + 
last_archive_stats = None + total_uncompressed_size = 0 + total_compressed_size = 0 + while True: + line = proc.stdout.readline() + if not line: + break + stats = json.loads(line.decode('ascii')) + if last_archive_stats is not None and stats['id'] != last_archive_stats['id']: + # We've started a new archive so add the previous archive's last + # reported size to the total + total_uncompressed_size += last_archive_stats['uncompressed_size'] + total_compressed_size += last_archive_stats['size'] + last_archive_stats = stats + if last_archive_stats is not None: + # Add the last archive's last reported size + total_uncompressed_size += last_archive_stats['uncompressed_size'] + total_compressed_size += last_archive_stats['size'] + + # Wait for compression to finish + return_code = proc.wait() + if 0 != return_code: + logger.error(f'Failed to compress, return_code={str(return_code)}') + else: + compression_successful = True + + # Remove generated temporary files + if log_list_path: + log_list_path.unlink() + db_config_file_path.unlink() + logger.debug('Compressed.') + + # Close stderr log file + stderr_log_file.close() + + if compression_successful: + return compression_successful, { + 'total_uncompressed_size': total_uncompressed_size, + 'total_compressed_size': total_compressed_size, + } + else: + return compression_successful, {'error_message': f'See logs {stderr_log_path}'} diff --git a/components/job-orchestration/job_orchestration/executor/compression/task.py b/components/job-orchestration/job_orchestration/executor/compression/task.py new file mode 100644 index 000000000..3460c9818 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/compression/task.py @@ -0,0 +1,63 @@ +import json +import os +from contextlib import closing + +import pika +from celery.utils.log import get_task_logger + +from job_orchestration.executor.celery import app +from . 
import fs_to_fs_compress_method + +logger = get_task_logger(__name__) + +from clp_py_utils.clp_io_config import ClpIoConfig, PathsToCompress + + +@app.task() +def compress(job_id: int, task_id: int, clp_io_config_json: str, paths_to_compress_json: str, + database_connection_params): + clp_home = os.getenv('CLP_HOME') + data_dir = os.getenv('CLP_DATA_DIR') + logs_dir = os.getenv('CLP_LOGS_DIR') + celery_broker_url = os.getenv('BROKER_URL') + + logger.debug(f'CLP_HOME: {clp_home}') + logger.info(f'COMPRESSING job_id={job_id} task_id={task_id}') + + clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) + paths_to_compress = PathsToCompress.parse_raw(paths_to_compress_json) + + message = {'job_id': job_id, 'task_id': task_id, 'status': 'COMPRESSING'} + + with closing(pika.BlockingConnection(pika.URLParameters(celery_broker_url))) as conn: + with closing(conn.channel()) as channel: + channel.tx_select() + channel.queue_declare('results') + + channel.basic_publish(exchange='', routing_key='results', + body=json.dumps(message).encode('utf-8')) + channel.tx_commit() + logger.info(f'COMPRESSION STARTED job_id={job_id} task_id={task_id}') + + if 'fs' == clp_io_config.input.type and 'fs' == clp_io_config.output.type: + compression_successful, worker_output = \ + fs_to_fs_compress_method.compress( + clp_io_config, clp_home, data_dir, logs_dir, str(job_id), str(task_id), + paths_to_compress, database_connection_params) + else: + raise NotImplementedError + + if compression_successful: + message['status'] = 'COMPLETED' + message['total_uncompressed_size'] = worker_output['total_uncompressed_size'] + message['total_compressed_size'] = worker_output['total_compressed_size'] + else: + message['status'] = 'FAILED' + message['error_message'] = worker_output['error_message'] + + with closing(pika.BlockingConnection(pika.URLParameters(celery_broker_url))) as conn: + with closing(conn.channel()) as channel: + channel.tx_select() + channel.basic_publish(exchange='', routing_key='results', body=json.dumps(message).encode('utf-8')) + channel.tx_commit() + logger.info(f'COMPRESSION COMPLETED job_id={job_id} task_id={task_id}') diff --git a/components/job-orchestration/job_orchestration/scheduler/__init__.py b/components/job-orchestration/job_orchestration/scheduler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/job-orchestration/job_orchestration/scheduler/results_consumer.py b/components/job-orchestration/job_orchestration/scheduler/results_consumer.py new file mode 100644 index 000000000..0db8c6185 --- /dev/null +++ b/components/job-orchestration/job_orchestration/scheduler/results_consumer.py @@ -0,0 +1,368 @@ +import functools +import logging +import time + +import pika +from pika.exchange_type import ExchangeType + +LOG_FORMAT = ('%(levelname) -10s %(asctime)s %(name) -30s %(funcName) ' + '-35s %(lineno) -5d: %(message)s') +LOGGER = logging.getLogger(__name__) + + +class ResultsConsumer(object): + """This is an example consumer that will handle unexpected interactions + with RabbitMQ such as channel and connection closures. + If RabbitMQ closes the connection, this class will stop and indicate + that reconnection is necessary. You should look at the output, as + there are limited reasons why the connection may be closed, which + usually are tied to permission related issues or socket timeouts. + If the channel is closed, it will indicate a problem with one of the + commands that were issued and that should surface in the output as well. 
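    In this component, the consumer listens on the 'results' queue for the JSON
    task-status updates published by the compression workers and hands each
    message to the scheduler's callback.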
+ """ + EXCHANGE = 'results' + EXCHANGE_TYPE = ExchangeType.topic + QUEUE = 'results' + ROUTING_KEY = 'results' + + def __init__(self, amqp_url, on_messge_callback): + """Create a new instance of the consumer class, passing in the AMQP + URL used to connect to RabbitMQ. + :param str amqp_url: The AMQP url to connect with + """ + self.should_reconnect = False + self.was_consuming = False + + self._connection = None + self._channel = None + self._closing = False + self._consumer_tag = None + self._url = amqp_url + self._consuming = False + # In production, experiment with higher prefetch values + # for higher consumer throughput + self._prefetch_count = 1 + self.on_message = on_messge_callback + + def connect(self): + """This method connects to RabbitMQ, returning the connection handle. + When the connection is established, the on_connection_open method + will be invoked by pika. + :rtype: pika.SelectConnection + """ + LOGGER.info('Connecting to %s', self._url) + return pika.SelectConnection( + parameters=pika.URLParameters(self._url), + on_open_callback=self.on_connection_open, + on_open_error_callback=self.on_connection_open_error, + on_close_callback=self.on_connection_closed) + + def close_connection(self): + self._consuming = False + if self._connection.is_closing or self._connection.is_closed: + LOGGER.info('Connection is closing or already closed') + else: + LOGGER.info('Closing connection') + self._connection.close() + + def on_connection_open(self, _unused_connection): + """This method is called by pika once the connection to RabbitMQ has + been established. It passes the handle to the connection object in + case we need it, but in this case, we'll just mark it unused. + :param pika.SelectConnection _unused_connection: The connection + """ + LOGGER.info('Connection opened') + self.open_channel() + + def on_connection_open_error(self, _unused_connection, err): + """This method is called by pika if the connection to RabbitMQ + can't be established. + :param pika.SelectConnection _unused_connection: The connection + :param Exception err: The error + """ + LOGGER.error('Connection open failed: %s', err) + self.reconnect() + + def on_connection_closed(self, _unused_connection, reason): + """This method is invoked by pika when the connection to RabbitMQ is + closed unexpectedly. Since it is unexpected, we will reconnect to + RabbitMQ if it disconnects. + :param pika.connection.Connection connection: The closed connection obj + :param Exception reason: exception representing reason for loss of + connection. + """ + self._channel = None + if self._closing: + self._connection.ioloop.stop() + else: + LOGGER.warning('Connection closed, reconnect necessary: %s', reason) + self.reconnect() + + def reconnect(self): + """Will be invoked if the connection can't be opened or is + closed. Indicates that a reconnect is necessary then stops the + ioloop. + """ + self.should_reconnect = True + self.stop() + + def open_channel(self): + """Open a new channel with RabbitMQ by issuing the Channel.Open RPC + command. When RabbitMQ responds that the channel is open, the + on_channel_open callback will be invoked by pika. + """ + LOGGER.info('Creating a new channel') + self._connection.channel(on_open_callback=self.on_channel_open) + + def on_channel_open(self, channel): + """This method is invoked by pika when the channel has been opened. + The channel object is passed in so we can make use of it. + Since the channel is now open, we'll declare the exchange to use. 
+ :param pika.channel.Channel channel: The channel object + """ + LOGGER.info('Channel opened') + self._channel = channel + self.add_on_channel_close_callback() + self.setup_exchange(self.EXCHANGE) + + def add_on_channel_close_callback(self): + """This method tells pika to call the on_channel_closed method if + RabbitMQ unexpectedly closes the channel. + """ + LOGGER.info('Adding channel close callback') + self._channel.add_on_close_callback(self.on_channel_closed) + + def on_channel_closed(self, channel, reason): + """Invoked by pika when RabbitMQ unexpectedly closes the channel. + Channels are usually closed if you attempt to do something that + violates the protocol, such as re-declare an exchange or queue with + different parameters. In this case, we'll close the connection + to shutdown the object. + :param pika.channel.Channel: The closed channel + :param Exception reason: why the channel was closed + """ + LOGGER.warning('Channel %i was closed: %s', channel, reason) + self.close_connection() + + def setup_exchange(self, exchange_name): + """Setup the exchange on RabbitMQ by invoking the Exchange.Declare RPC + command. When it is complete, the on_exchange_declareok method will + be invoked by pika. + :param str|unicode exchange_name: The name of the exchange to declare + """ + LOGGER.info('Declaring exchange: %s', exchange_name) + # Note: using functools.partial is not required, it is demonstrating + # how arbitrary data can be passed to the callback when it is called + cb = functools.partial( + self.on_exchange_declareok, userdata=exchange_name) + self._channel.exchange_declare( + exchange=exchange_name, + exchange_type=self.EXCHANGE_TYPE, + callback=cb) + + def on_exchange_declareok(self, _unused_frame, userdata): + """Invoked by pika when RabbitMQ has finished the Exchange.Declare RPC + command. + :param pika.Frame.Method unused_frame: Exchange.DeclareOk response frame + :param str|unicode userdata: Extra user data (exchange name) + """ + LOGGER.info('Exchange declared: %s', userdata) + self.setup_queue(self.QUEUE) + + def setup_queue(self, queue_name): + """Setup the queue on RabbitMQ by invoking the Queue.Declare RPC + command. When it is complete, the on_queue_declareok method will + be invoked by pika. + :param str|unicode queue_name: The name of the queue to declare. + """ + LOGGER.info('Declaring queue %s', queue_name) + cb = functools.partial(self.on_queue_declareok, userdata=queue_name) + self._channel.queue_declare(queue=queue_name, callback=cb) + + def on_queue_declareok(self, _unused_frame, userdata): + """Method invoked by pika when the Queue.Declare RPC call made in + setup_queue has completed. In this method we will bind the queue + and exchange together with the routing key by issuing the Queue.Bind + RPC command. When this command is complete, the on_bindok method will + be invoked by pika. + :param pika.frame.Method _unused_frame: The Queue.DeclareOk frame + :param str|unicode userdata: Extra user data (queue name) + """ + queue_name = userdata + LOGGER.info('Binding %s to %s with %s', self.EXCHANGE, queue_name, + self.ROUTING_KEY) + cb = functools.partial(self.on_bindok, userdata=queue_name) + self._channel.queue_bind( + queue_name, + self.EXCHANGE, + routing_key=self.ROUTING_KEY, + callback=cb) + + def on_bindok(self, _unused_frame, userdata): + """Invoked by pika when the Queue.Bind method has completed. At this + point we will set the prefetch count for the channel. 
+ :param pika.frame.Method _unused_frame: The Queue.BindOk response frame + :param str|unicode userdata: Extra user data (queue name) + """ + LOGGER.info('Queue bound: %s', userdata) + self.set_qos() + + def set_qos(self): + """This method sets up the consumer prefetch to only be delivered + one message at a time. The consumer must acknowledge this message + before RabbitMQ will deliver another one. You should experiment + with different prefetch values to achieve desired performance. + """ + self._channel.basic_qos( + prefetch_count=self._prefetch_count, callback=self.on_basic_qos_ok) + + def on_basic_qos_ok(self, _unused_frame): + """Invoked by pika when the Basic.QoS method has completed. At this + point we will start consuming messages by calling start_consuming + which will invoke the needed RPC commands to start the process. + :param pika.frame.Method _unused_frame: The Basic.QosOk response frame + """ + LOGGER.info('QOS set to: %d', self._prefetch_count) + self.start_consuming() + + def start_consuming(self): + """This method sets up the consumer by first calling + add_on_cancel_callback so that the object is notified if RabbitMQ + cancels the consumer. It then issues the Basic.Consume RPC command + which returns the consumer tag that is used to uniquely identify the + consumer with RabbitMQ. We keep the value to use it when we want to + cancel consuming. The on_message method is passed in as a callback pika + will invoke when a message is fully received. + """ + LOGGER.info('Issuing consumer related RPC commands') + self.add_on_cancel_callback() + self._consumer_tag = self._channel.basic_consume( + self.QUEUE, self.on_message) + self.was_consuming = True + self._consuming = True + + def add_on_cancel_callback(self): + """Add a callback that will be invoked if RabbitMQ cancels the consumer + for some reason. If RabbitMQ does cancel the consumer, + on_consumer_cancelled will be invoked by pika. + """ + LOGGER.info('Adding consumer cancellation callback') + self._channel.add_on_cancel_callback(self.on_consumer_cancelled) + + def on_consumer_cancelled(self, method_frame): + """Invoked by pika when RabbitMQ sends a Basic.Cancel for a consumer + receiving messages. + :param pika.frame.Method method_frame: The Basic.Cancel frame + """ + LOGGER.info('Consumer was cancelled remotely, shutting down: %r', + method_frame) + if self._channel: + self._channel.close() + + def acknowledge_message(self, delivery_tag): + """Acknowledge the message delivery from RabbitMQ by sending a + Basic.Ack RPC method for the delivery tag. + :param int delivery_tag: The delivery tag from the Basic.Deliver frame + """ + LOGGER.info('Acknowledging message %s', delivery_tag) + self._channel.basic_ack(delivery_tag) + + def stop_consuming(self): + """Tell RabbitMQ that you would like to stop consuming by sending the + Basic.Cancel RPC command. + """ + if self._channel: + LOGGER.info('Sending a Basic.Cancel RPC command to RabbitMQ') + cb = functools.partial( + self.on_cancelok, userdata=self._consumer_tag) + self._channel.basic_cancel(self._consumer_tag, cb) + + def on_cancelok(self, _unused_frame, userdata): + """This method is invoked by pika when RabbitMQ acknowledges the + cancellation of a consumer. At this point we will close the channel. + This will invoke the on_channel_closed method once the channel has been + closed, which will in-turn close the connection. 
+ :param pika.frame.Method _unused_frame: The Basic.CancelOk frame + :param str|unicode userdata: Extra user data (consumer tag) + """ + self._consuming = False + LOGGER.info( + 'RabbitMQ acknowledged the cancellation of the consumer: %s', + userdata) + self.close_channel() + + def close_channel(self): + """Call to close the channel with RabbitMQ cleanly by issuing the + Channel.Close RPC command. + """ + LOGGER.info('Closing the channel') + self._channel.close() + + def run(self): + """Run the example consumer by connecting to RabbitMQ and then + starting the IOLoop to block and allow the SelectConnection to operate. + """ + self._connection = self.connect() + self._connection.ioloop.start() + + def stop(self): + """Cleanly shutdown the connection to RabbitMQ by stopping the consumer + with RabbitMQ. When RabbitMQ confirms the cancellation, on_cancelok + will be invoked by pika, which will then closing the channel and + connection. The IOLoop is started again because this method is invoked + when CTRL-C is pressed raising a KeyboardInterrupt exception. This + exception stops the IOLoop which needs to be running for pika to + communicate with RabbitMQ. All of the commands issued prior to starting + the IOLoop will be buffered but not processed. + """ + if not self._closing: + self._closing = True + LOGGER.info('Stopping') + if self._consuming: + self.stop_consuming() + self._connection.ioloop.start() + else: + self._connection.ioloop.stop() + LOGGER.info('Stopped') + + +class ReconnectingResultsConsumer(object): + """This is an example consumer that will reconnect if the nested + ResultsConsumer indicates that a reconnect is necessary. + """ + + def __init__(self, amqp_url, on_message_callback): + self._reconnect_delay = 0 + self._amqp_url = amqp_url + self._on_message_callback = on_message_callback + self._consumer = ResultsConsumer(self._amqp_url, self._on_message_callback) + + def run(self): + while True: + try: + self._consumer.run() + except KeyboardInterrupt: + self._consumer.stop() + break + self._maybe_reconnect() + + def _maybe_reconnect(self): + if self._consumer.should_reconnect: + self._consumer.stop() + reconnect_delay = self._get_reconnect_delay() + LOGGER.info('Reconnecting after %d seconds', reconnect_delay) + time.sleep(reconnect_delay) + self._consumer = ResultsConsumer(self._amqp_url, self._on_message_callback) + + def _get_reconnect_delay(self): + if self._consumer.was_consuming: + self._reconnect_delay = 0 + else: + self._reconnect_delay += 1 + if self._reconnect_delay > 30: + self._reconnect_delay = 30 + return self._reconnect_delay + + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler.py b/components/job-orchestration/job_orchestration/scheduler/scheduler.py new file mode 100644 index 000000000..a6d0c92bd --- /dev/null +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler.py @@ -0,0 +1,343 @@ +import argparse +import datetime +import logging +import os +import pathlib +import sys +import threading +import time +import typing +from contextlib import closing + +import zstandard +from pydantic import ValidationError + +from clp_py_utils.clp_config import CLPConfig, Database +from clp_py_utils.sql_adapter import SQL_Adapter +from job_orchestration.executor.compression.task import compress +from job_orchestration.scheduler.results_consumer import ReconnectingResultsConsumer +from job_orchestration.scheduler.scheduler_data \ + import Job, Task, TaskUpdate, 
TaskCompletionUpdate, TaskFailureUpdate + +# Setup logging +# Create logger +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) +console_handler.setFormatter( + logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s')) +log = logging.getLogger('scheduler') +log.addHandler(console_handler) +log.setLevel(logging.DEBUG) + +scheduled_jobs = {} +jobs_lock = threading.Lock() + +from clp_py_utils.core import read_yaml_config_file + + +def fetch_new_task_metadata(db_cursor) -> list: + db_cursor.execute( + """ + SELECT compression_jobs.job_id, + compression_jobs.job_status, + compression_jobs.num_tasks, + compression_jobs.num_tasks_completed, + compression_jobs.clp_config, + compression_tasks.task_id, + compression_tasks.task_status, + compression_tasks.clp_paths_to_compress + FROM compression_jobs INNER JOIN compression_tasks + ON compression_jobs.job_id=compression_tasks.job_id + WHERE compression_tasks.task_status='SUBMITTED'; + """ + ) + return db_cursor.fetchall() + + +def update_task_metadata(db_cursor, task_id, kv: typing.Dict[str, typing.Any]): + if not len(kv): + log.error("Must specify at least one field to update") + raise ValueError + + field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + query = f'UPDATE compression_tasks SET {", ".join(field_set_expressions)} ' \ + f'WHERE task_id={task_id};' + db_cursor.execute(query) + + +def update_job_metadata(db_cursor, job_id, kv): + if not len(kv): + log.error("Must specify at least one field to update") + raise ValueError + + field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + query = f'UPDATE compression_jobs SET {", ".join(field_set_expressions)} ' \ + f'WHERE job_id={job_id};' + db_cursor.execute(query) + + +def increment_job_metadata(db_cursor, job_id, kv): + if not len(kv): + log.error("Must specify at least one field to increment") + raise ValueError + + field_set_expressions = [f'{k}={k}+{v}' for k, v in kv.items()] + query = f'UPDATE compression_jobs SET {", ".join(field_set_expressions)} ' \ + f'WHERE job_id={job_id};' + db_cursor.execute(query) + + +def schedule_task(job: Job, task: Task, database_config: Database, dctx: zstandard.ZstdDecompressor = None): + return compress.apply_async( + (job.job_id, task.task_id, + job.get_clp_config_json(dctx), + task.get_clp_paths_to_compress_json(dctx), + database_config.get_clp_connection_params_and_type()), + task_id=str(task.task_id), queue='compression', priority=task.priority) + + +def search_and_schedule_new_tasks(db_conn, db_cursor, database_config: Database): + """ + For all task with SUBMITTED status, push them to task queue to be processed, if finished, update them + """ + global scheduled_jobs + global jobs_lock + + log.debug('Search and schedule new tasks') + + dctx = zstandard.ZstdDecompressor() + + # Fetch new task + for task_row in fetch_new_task_metadata(db_cursor): + log.debug(f"Found task with job_id={task_row['job_id']} task_id={task_row['task_id']}") + + # Only Add database credentials to ephemeral task specification passed to workers + task = Task.parse_obj(task_row) + job_id: int = task_row['job_id'] + + with jobs_lock: + now = datetime.datetime.utcnow() + + try: + job = scheduled_jobs[job_id] + except KeyError: + # Identified a new job identified + job = Job(job_start_time=now, **task_row) + update_job_metadata(db_cursor, job_id, dict( + job_start_time=now.strftime('%Y-%m-%d %H:%M:%S') + )) + + # Schedule task, update ephemeral metadata in scheduler and commit to database + celery_task_instance = 
schedule_task(job, task, database_config, dctx) + + update_task_metadata(db_cursor, task.task_id, dict( + task_status='SCHEDULED', + task_scheduled_time=now.strftime('%Y-%m-%d %H:%M:%S') + )) + db_conn.commit() + + # After database commit is successful, update internal metadata + task.instance = celery_task_instance + task.task_status = 'SCHEDULED' + job.tasks[task.task_id] = task + + # Optimization: if job has finished scheduling while we are scheduling task, + # Then we'll update the job's status and num_tasks count + try: + if 'SCHEDULED' == task_row['job_status']: + job.num_tasks = task_row['num_tasks'] + job.job_status = task_row['job_status'] + except KeyError: + pass + + scheduled_jobs[job_id] = job + db_conn.commit() + + +def update_completed_jobs(db_conn, db_cursor): + # Update completed jobs if there are any + db_cursor.execute( + """ + UPDATE compression_jobs + SET job_status="COMPLETED", job_duration=TIMESTAMPDIFF(SECOND,job_start_time, CURRENT_TIMESTAMP()) + WHERE job_status="SCHEDULED" AND num_tasks=num_tasks_completed; + """ + ) + db_conn.commit() + + +def task_results_consumer(sql_adapter: SQL_Adapter, celery_broker_url): + global scheduled_jobs + global jobs_lock + + def callback(ch, method, properties, body): + global scheduled_jobs + global jobs_lock + global log + + try: + # Validate message body + task_update = TaskUpdate.parse_raw(body) + if 'COMPLETED' == task_update.status: + task_update = TaskCompletionUpdate.parse_raw(body) + elif 'FAILED' == task_update.status: + task_update = TaskFailureUpdate.parse_raw(body) + except ValidationError as err: + log.error(err) + exit(-1) + + with closing(sql_adapter.create_connection()) as db_conn, \ + closing(db_conn.cursor(dictionary=True)) as db_cursor, jobs_lock: + log.debug(f'Task update received: ' + f'job_id={task_update.job_id} ' + f'task_id={task_update.task_id} ' + f'status={task_update.status}') + + # Retrieve scheduler state + try: + job = scheduled_jobs[task_update.job_id] + task = job.tasks[task_update.task_id] + except KeyError: + # Scheduler detected response from task which it does not keep track of + # It could be that previous scheduler crashed. 
+ # The only thing we can do is to log, and discard the message + # to prevent infinite loop + log.warning(f'Discarding untracked task update: {task_update.json()}') + ch.basic_ack(method.delivery_tag) + return + + # Process task update and update database + try: + # Scheduler is aware of the task + now = datetime.datetime.utcnow() + + if 'COMPRESSING' == task_update.status: + # Update sent by worker when task began in the database + update_task_metadata(db_cursor, task_update.task_id, dict( + task_status=task_update.status, + task_start_time=now.strftime('%Y-%m-%d %H:%M:%S') + )) + elif 'COMPLETED' == task_update.status: + # Update sent by worker when task finishes + if 'COMPRESSING' != task.task_status: + log.warning(f'Discarding untracked task update: {task_update.json()}') + ch.basic_ack(method.delivery_tag) + raise NotImplementedError + + task_duration = max(int((now - task.task_start_time).total_seconds()), 1) + + log.info(f'Task job-{task_update.job_id}-task-{task_update.task_id} ' + f'completed in {task_duration} second.') + + update_task_metadata(db_cursor, task_update.task_id, dict( + task_status=task_update.status, + partition_uncompressed_size=task_update.total_uncompressed_size, + partition_compressed_size=task_update.total_compressed_size, + task_duration=int(task_duration) + )) + increment_job_metadata(db_cursor, task_update.job_id, dict( + job_uncompressed_size=task_update.total_uncompressed_size, + job_compressed_size=task_update.total_compressed_size, + num_tasks_completed=1 + )) + elif 'FAILED' == task_update.status: + log.warning(f'Marking job_id={task_update.job_id} as failed.') + log.warning(str(task_update.error_message)) + update_task_metadata(db_cursor, task_update.task_id, dict( + task_status=task_update.status, + task_duration=int((now - task.task_start_time).total_seconds()) + )) + update_job_metadata(db_cursor, job.job_id, dict( + job_status=task_update.status, + job_status_msg=task_update.error_message + )) + else: + raise NotImplementedError + + db_conn.commit() + + # Only update scheduler metadata only after transaction finishes + # If update fails, rollback and avoid updating scheduler state + job.tasks[task_update.task_id].task_status = task_update.status + if 'COMPRESSING' == task_update.status: + job.tasks[task_update.task_id].task_start_time = now + elif 'COMPLETED' == task_update.status: + job.num_tasks_completed += 1 + elif 'FAILED' == task_update.status: + # TODO: how to handle failure scheduler state update besides simply recording acknowledgement? 
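                    # A failed task fails the whole job: the job row was already marked FAILED
                    # in the database above, and the in-memory copy is kept in sync here; no
                    # retry of the task is attempted.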
+ job.job_status = task_update.status + pass + else: + raise NotImplementedError + + # Only send out the ACK if data successfully persisted to the database + ch.basic_ack(method.delivery_tag) + + except Exception as error: + # Transaction failure, rollback, don't send ACK and simply reprocess the msg again + log.error(f'Database update failed: {error}.') + db_conn.rollback() + + consumer = ReconnectingResultsConsumer(celery_broker_url, callback) + consumer_thread = threading.Thread(target=consumer.run) + consumer_thread.start() + return consumer + + +def main(argv): + global scheduled_jobs + args_parser = argparse.ArgumentParser() + args_parser.add_argument('--config', '-c', required=True, help='CLP configuration file.') + args = args_parser.parse_args(argv[1:]) + + celery_broker_url = os.getenv('BROKER_URL') + + # Load configuration + config_path = pathlib.Path(args.config) + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(config_path)) + except ValidationError as err: + log.error(err) + except Exception as ex: + log.error(ex) + # read_yaml_config_file already logs the parsing error inside + pass + else: + # Collect new jobs from the database + log.info('Starting CLP job scheduler') + sql_adapter = SQL_Adapter(clp_config.database) + + results_consumer = task_results_consumer(sql_adapter, celery_broker_url) + + while True: + try: + # Start Job Processing Loop + with closing(sql_adapter.create_connection()) as db_conn, \ + closing(db_conn.cursor(dictionary=True)) as db_cursor: + search_and_schedule_new_tasks(db_conn, db_cursor, sql_adapter.database_config) + update_completed_jobs(db_conn, db_cursor) + except Exception as ex: + log.error('Error in scheduling: ') + log.error(ex) + finally: + try: + time.sleep(clp_config.scheduler.jobs_poll_delay) + except KeyboardInterrupt: + log.info('Gracefully shutting down') + break + + if results_consumer: + try: + results_consumer._consumer.stop() + except RuntimeError as err: + if 'IOLoop is not reentrant and is already running' != str(err): + log.error(err) + raise RuntimeError + else: + # Normal graceful shutdown path + pass + log.info('Scheduler stopped') + + +if '__main__' == __name__: + main(sys.argv) diff --git a/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py new file mode 100644 index 000000000..8c8b411a4 --- /dev/null +++ b/components/job-orchestration/job_orchestration/scheduler/scheduler_data.py @@ -0,0 +1,63 @@ +import datetime +import json +import typing +from typing import Dict + +import msgpack +import zstandard +from celery.result import AsyncResult +from pydantic import BaseModel, validator + + +class TaskUpdate(BaseModel): + job_id: int + task_id: int + status: str + + @validator('status') + def valid_status(cls, field): + supported_status = ['COMPRESSING', 'COMPLETED', 'FAILED'] + if field not in supported_status: + raise ValueError(f'must be one of the following {"|".join(supported_status)}') + return field + + +class TaskCompletionUpdate(TaskUpdate): + total_uncompressed_size: int + total_compressed_size: int + + +class TaskFailureUpdate(TaskUpdate): + error_message: str + + +class Task(BaseModel): + task_id: int + task_status: str + priority: int = 1 + clp_paths_to_compress: bytes + task_start_time: datetime.datetime = None + instance: AsyncResult = None + + class Config: + arbitrary_types_allowed = True + + def get_clp_paths_to_compress_json(self, dctx: zstandard.ZstdDecompressor = None): + if dctx is None: + dctx 
= zstandard.ZstdDecompressor() + return json.dumps(msgpack.unpackb(dctx.decompress(self.clp_paths_to_compress))) + + +class Job(BaseModel): + job_id: int + job_status: str + job_start_time: datetime.datetime + clp_config: bytes + num_tasks: typing.Optional[int] + num_tasks_completed: int + tasks: Dict[int, Task] = {} + + def get_clp_config_json(self, dctx: zstandard.ZstdDecompressor = None): + if not dctx: + dctx = zstandard.ZstdDecompressor() + return json.dumps(msgpack.unpackb(dctx.decompress(self.clp_config))) diff --git a/components/job-orchestration/requirements.txt b/components/job-orchestration/requirements.txt new file mode 100644 index 000000000..0e8b2eb39 --- /dev/null +++ b/components/job-orchestration/requirements.txt @@ -0,0 +1,8 @@ +python-Levenshtein +pika==1.2.0 +celery==5.1.2 +msgpack~=1.0.2 +zstandard~=0.15.2 +mysql-connector-python==8.0.26 +pydantic==1.8.2 +PyYAML==5.4 diff --git a/components/package-template/README.md b/components/package-template/README.md new file mode 100644 index 000000000..86f70b507 --- /dev/null +++ b/components/package-template/README.md @@ -0,0 +1,5 @@ +# Package Template + +This component contains the base directory structure and files of the CLP package. + +*NOTE: This is only a small part of the complete CLP package and cannot be run alone.* diff --git a/components/package-template/src/.gitignore b/components/package-template/src/.gitignore new file mode 100644 index 000000000..3283d6cd6 --- /dev/null +++ b/components/package-template/src/.gitignore @@ -0,0 +1 @@ +etc/clp-config.yaml diff --git a/components/package-template/src/LICENSE b/components/package-template/src/LICENSE new file mode 100644 index 000000000..3340c889f --- /dev/null +++ b/components/package-template/src/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 YScope Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/components/package-template/src/README.md b/components/package-template/src/README.md new file mode 100644 index 000000000..96883fde3 --- /dev/null +++ b/components/package-template/src/README.md @@ -0,0 +1,116 @@ +# CLP + +Compressed Log Processor (CLP) is a tool that compresses text logs and allows users to search the compressed data +without decompression. CLP's compression ratio is significantly higher than gzip. 
+
+## Getting started
+
+CLP can be run in Docker containers, in one of two modes:
+* On a single node (typically for development and testing)
+* Across multiple nodes
+
+## Single-node deployment
+
+### Requirements
+
+* [Docker](https://docs.docker.com/engine/install/)
+  * `docker` should be in the user's path, and
+  * [runnable without superuser privileges](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user)
+    (without sudo)
+* Plenty of disk space
+* Python3
+  * For systems with a Python version < 3.7, run `pip3 install -r requirements-pre-3.7.txt`
+
+### Starting CLP
+
+```bash
+./sbin/start-clp --uncompressed-logs-dir <uncompressed logs directory>
+```
+
+Note that running CLP in containers means that the `uncompressed-logs-dir` must be mounted inside the container.
+Therefore:
+* The `uncompressed-logs-dir` must not include symbolic links to items **outside** of the directory
+* Changing `uncompressed-logs-dir` requires restarting CLP.
+
+### Stopping CLP
+
+```bash
+./sbin/stop-clp
+```
+
+## Multi-node deployment
+
+### Requirements
+
+* The single-node deployment requirements
+* On the scheduler node, ports 3306 and 5672 must be available and accessible from all compute nodes
+* A distributed file system mounted at the same path on all nodes
+
+### Starting the scheduler
+
+```bash
+sbin/start-clp --start-scheduler-only --publish-ports \
+  --uncompressed-logs-dir <uncompressed logs directory>
+```
+
+### Starting the worker(s)
+
+```bash
+sbin/start-clp --start-worker-only --publish-ports \
+  --uncompressed-logs-dir <uncompressed logs directory>
+```
+
+### Stopping components
+
+Every component can be stopped with:
+```bash
+./sbin/stop-clp
+```
+
+## Usage
+
+Once CLP is started, you can use it as follows.
+
+### Compressing logs
+
+```bash
+./sbin/compress <path> [<path> ...]
+```
+
+Note:
+* The uncompressed logs must be within `uncompressed-logs-dir`
+* CLP is designed to compress text logs
+
+For more options, run the script with the `--help` option.
+
+### Decompressing logs
+
+To decompress all compressed logs:
+```bash
+./sbin/decompress -d <extraction directory>
+```
+For more options, run the script with the `--help` option.
+
+### Searching logs
+
+To search all logs for a given wildcard query:
+```bash
+./sbin/search <wildcard query>
+```
+
+CLP supports two wildcard characters:
+* `*` which matches 0 or more characters
+* `?` which matches any single character
+
+For more options, run the script with the `--help` option.
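+
+For example, the following hypothetical query (the log message format is only illustrative) matches any line
+containing "ERROR" followed by a timeout message with a single-character retry count:
+
+```bash
+# Quote the query so the shell does not expand the wildcards itself
+./sbin/search "*ERROR*timed out after ? retries*"
+```
+
+To restrict the search to a single original file, pass its path with `--file-path`.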
+ +## Troubleshooting + +### ModuleNotFoundError + +**Error message**: ```ModuleNotFoundError: No module named 'dataclasses'``` + +**Cause**: When starting the package on some older platforms like Ubuntu 18.04, some required Python modules are not in +the standard library + +**Solution**: `pip install -r requirements-pre-3.7.txt` diff --git a/components/package-template/src/etc/clp-config.yaml.template b/components/package-template/src/etc/clp-config.yaml.template new file mode 100644 index 000000000..e10728715 --- /dev/null +++ b/components/package-template/src/etc/clp-config.yaml.template @@ -0,0 +1,14 @@ +clp_cluster_name: clp-mini-cluster + +archive_output: + # How much data CLP should try to compress into each archive + target_archive_size: 268435456 # 256MB + + # How large the dictionaries should be allowed to get before the archive is closed and a new one is created + target_dictionaries_size: 33554432 # 32MB + + # How large each encoded file should be before being split into a new encoded file + target_encoded_file_size: 268435456 # 256MB + + # How much data CLP should try to fit into each segment within an archive + target_segment_size: 268435456 # 256MB diff --git a/components/package-template/src/lib/python3/site-packages/clp/__init__.py b/components/package-template/src/lib/python3/site-packages/clp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/components/package-template/src/lib/python3/site-packages/clp/package_utils.py b/components/package-template/src/lib/python3/site-packages/clp/package_utils.py new file mode 100644 index 000000000..854fe7576 --- /dev/null +++ b/components/package-template/src/lib/python3/site-packages/clp/package_utils.py @@ -0,0 +1,70 @@ +import json +import pathlib +import subprocess + +from clp_py_utils.clp_config import CLPConfig + + +def check_dependencies(): + try: + subprocess.run('command -v git', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + except subprocess.CalledProcessError: + raise EnvironmentError('git is not installed on the path.') + + try: + subprocess.run('command -v docker', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + subprocess.run(['docker', 'ps'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + except subprocess.CalledProcessError: + raise EnvironmentError('docker is not installed on the path or cannot run without superuser privileges (sudo).') + + +def check_env(cluster_name: str): + check_docker_network_bridge_cmd = ['docker', 'network', 'inspect', cluster_name] + proc = subprocess.run(check_docker_network_bridge_cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if 0 != proc.returncode: + raise EnvironmentError(f'Failed to inspect docker network bridge {cluster_name}') + + bridge_bridge_specification = json.loads(proc.stdout.decode('utf-8'))[0] + required_containers = {cluster_name} + for container_id, container in bridge_bridge_specification['Containers'].items(): + try: + required_containers.remove(container['Name']) + except KeyError: + pass + + if required_containers: + raise EnvironmentError(f'The required container is not started: {",".join(required_containers)}') + + +def prepare_package_and_config(clp_config: CLPConfig, clp_home: pathlib.Path, docker_clp_home: pathlib.Path): + host_data_directory = pathlib.Path(clp_config.data_directory) + if '' == host_data_directory.anchor: + # In the config file, we assume prefix is clp_home inside the docker (/root/clp) + host_data_directory = clp_home / 
clp_config.data_directory + clp_config.data_directory = str(docker_clp_home / clp_config.data_directory) + host_data_directory.mkdir(parents=True, exist_ok=True) + + host_log_directory = pathlib.Path(clp_config.logs_directory) + if '' == host_log_directory.anchor: + # In the config file, we assume prefix is clp_home, inside the docker (/root/clp) + host_log_directory = clp_home / clp_config.logs_directory + clp_config.logs_directory = str(docker_clp_home / clp_config.logs_directory) + host_log_directory.mkdir(parents=True, exist_ok=True) + + host_archive_output_directory = pathlib.Path(clp_config.archive_output.directory) + if '' == host_archive_output_directory.anchor: + # In the config file, we assume prefix is clp_home, inside the docker (/root/clp) + host_archive_output_directory = clp_home / clp_config.archive_output.directory + clp_config.archive_output.directory = \ + str(docker_clp_home / clp_config.archive_output.directory) + host_archive_output_directory.mkdir(parents=True, exist_ok=True) + + return host_data_directory, host_log_directory, host_archive_output_directory, clp_config + + +def make_config_path_absolute(clp_home: pathlib.Path, config_path: pathlib.Path): + if config_path.is_absolute(): + return config_path + else: + return clp_home / config_path diff --git a/components/package-template/src/requirements-pre-3.7.txt b/components/package-template/src/requirements-pre-3.7.txt new file mode 100644 index 000000000..60f564425 --- /dev/null +++ b/components/package-template/src/requirements-pre-3.7.txt @@ -0,0 +1 @@ +dataclasses==0.8 \ No newline at end of file diff --git a/components/package-template/src/sbin/compress b/components/package-template/src/sbin/compress new file mode 100755 index 000000000..7808ccb9c --- /dev/null +++ b/components/package-template/src/sbin/compress @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +import argparse +import logging +import os +import pathlib +import subprocess +import sys + +# Setup logging +# Create logger +log = logging.getLogger('clp') +log.setLevel(logging.DEBUG) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +log.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + log.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + log.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + log.error('Failed to load python3 packages bundled with CLP.') + return -1 + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp.package_utils import check_env +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.clp_package_config import CLPPackageConfig +from pydantic import ValidationError + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Startup script for CLP') + 
args_parser.add_argument('--config', '-c', type=str, help='CLP package configuration file.') + args_parser.add_argument('paths', metavar='PATH', nargs='*', help='Paths to compress.') + args_parser.add_argument('-f', '--input-list', dest='input_list', help='A file listing all paths to compress.') + parsed_args = args_parser.parse_args(argv[1:]) + + # Infer config file path + try: + if not parsed_args.config: + # Did not provide a config file + default_clp_package_config_file = clp_home / 'etc' / 'clp-config.yaml' + if not default_clp_package_config_file.exists(): + raise FileNotFoundError + log.info(f'Using default config file at {default_clp_package_config_file.relative_to(pathlib.Path.cwd())}') + package_config_file_path = default_clp_package_config_file + else: + # Provided a config file + package_config_file_path = pathlib.Path(parsed_args.config).resolve(strict=True) + except FileNotFoundError: + log.error('Did not provide a clp package config file or the specified config file does not exist.') + return + + try: + clp_package_config = CLPPackageConfig.parse_obj(read_yaml_config_file(package_config_file_path)) + except ValidationError as err: + log.error(err) + return + except Exception as ex: + # read_yaml_config_file already logs the parsing error inside + return + + clp_cluster_name = clp_package_config.cluster_name + try: + check_env(clp_cluster_name) + except EnvironmentError as ex: + logging.error(ex) + return -1 + + # TODO: check path and perform path conversion + docker_exec_cmd = [ + 'docker', 'exec', + '--workdir', '/root/clp', + clp_package_config.cluster_name, + 'sbin/native/compress', '--config', f'/root/.{clp_package_config.cluster_name}.yaml' + ] + for path in parsed_args.paths: + docker_exec_cmd.append(path) + if parsed_args.input_list is not None: + docker_exec_cmd.append('--input-list') + docker_exec_cmd.append(parsed_args.input_list) + logging.info(docker_exec_cmd) + subprocess.run(docker_exec_cmd) + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/package-template/src/sbin/decompress b/components/package-template/src/sbin/decompress new file mode 100755 index 000000000..acca6e3d4 --- /dev/null +++ b/components/package-template/src/sbin/decompress @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +import argparse +import logging +import os +import pathlib +import shutil +import subprocess +import sys +import uuid + +# Setup logging +# Create logger +log = logging.getLogger('clp') +log.setLevel(logging.DEBUG) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +log.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + log.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + log.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + log.error('Failed to load python3 packages bundled with CLP.') + return -1 + # Add packages to the front of the path + 
sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp.package_utils import check_env +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.clp_package_config import CLPPackageConfig +from clp_py_utils.clp_config import CLPConfig +from pydantic import ValidationError + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Script to decompress logs') + args_parser.add_argument('--config', '-c', type=str, help='CLP package configuration file.') + args_parser.add_argument('paths', metavar='PATH', nargs='*', help='Paths to decompress.') + args_parser.add_argument('-f', '--files-from', help='A file listing all files to decompress.') + args_parser.add_argument('-d', '--extraction-dir', metavar='DIR', default='.', help='Decompress files into DIR') + parsed_args = args_parser.parse_args(argv[1:]) + + # Infer config file path + try: + if not parsed_args.config: + # Did not provide a config file + default_clp_package_config_file = clp_home / 'etc' / 'clp-config.yaml' + if not default_clp_package_config_file.exists(): + raise FileNotFoundError + log.info(f'Using default config file at {default_clp_package_config_file.relative_to(pathlib.Path.cwd())}') + package_config_file_path = default_clp_package_config_file + else: + # Provided a config file + package_config_file_path = pathlib.Path(parsed_args.config).resolve(strict=True) + except FileNotFoundError: + log.error('Did not provide a clp package config file or the specified config file does not exist.') + return + + try: + clp_package_config = CLPPackageConfig.parse_obj(read_yaml_config_file(package_config_file_path)) + except ValidationError as err: + log.error(err) + return + except Exception as ex: + # read_yaml_config_file already logs the parsing error inside + return + + # Validate paths were specified using only one method + if len(parsed_args.paths) > 0 and parsed_args.files_from is not None: + args_parser.error( + "Paths cannot be specified both on the command line and through a file.") + return -1 + + files_to_decompress_path = None + if parsed_args.files_from: + files_to_decompress_path = pathlib.Path(parsed_args.files_from).resolve(strict=True) + + # Validate extraction directory + extraction_dir = pathlib.Path(parsed_args.extraction_dir).resolve() + if extraction_dir.exists() and not extraction_dir.is_dir(): + log.error(f'extraction-dir ({extraction_dir}) is not a valid directory.') + return -1 + extraction_dir.mkdir(exist_ok=True) + + clp_cluster_name = clp_package_config.cluster_name + try: + check_env(clp_cluster_name) + except EnvironmentError as ex: + log.error(ex) + return -1 + + # Parse and validate config file + container_clp_config_file_name = f'.{clp_package_config.cluster_name}.yaml' + host_config_file_path = clp_home / container_clp_config_file_name + container_config_file_path = f'/root/{container_clp_config_file_name}' + + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(host_config_file_path)) + except ValidationError as err: + log.error(err) + return -1 + except Exception as ex: + log.error(ex) + return -1 + + docker_clp_home = pathlib.Path('/') / 'root' / 'clp' + docker_extraction_dir = pathlib.Path('/') / 'mnt' / '_extraction_dir_' + + host_data_directory = clp_home / pathlib.Path(clp_config.data_directory).relative_to(docker_clp_home) + host_log_directory = clp_home / pathlib.Path(clp_config.logs_directory).relative_to(docker_clp_home) + 
host_archive_out_directory = \ + clp_home / pathlib.Path(clp_config.archive_output.directory).relative_to(docker_clp_home) + + # Start execution environment + clp_execution_env_container = 'whywhywhywhywhywhy/clp-execution-env:x86-ubuntu-focal-20210919' + container_name = f'{clp_cluster_name}-decompressor-{uuid.uuid4()}'[:62] # max docker hostname = 63 chars + clp_execution_env_startup_cmd = [ + 'docker', 'run', '-di', + '--rm', + '--network', clp_cluster_name, + '--hostname', container_name, + '--name', container_name, + '-v', f'{clp_home}:{docker_clp_home}', + '-v', f'{extraction_dir}:{docker_extraction_dir}' + ] + if not clp_config.data_directory.startswith('/root/clp'): + clp_execution_env_startup_cmd.append('-v') + clp_execution_env_startup_cmd.append(f'{host_data_directory}:{clp_config.data_directory}') + if not clp_config.logs_directory.startswith('/root/clp'): + clp_execution_env_startup_cmd.append('-v') + clp_execution_env_startup_cmd.append(f'{host_log_directory}:{clp_config.logs_directory}') + if not clp_config.archive_output.directory.startswith('/root/clp'): + clp_execution_env_startup_cmd.append('-v') + clp_execution_env_startup_cmd.append( + f'{host_archive_out_directory}:{clp_config.archive_output.directory}') + clp_execution_env_startup_cmd.append(clp_execution_env_container) + subprocess.run(clp_execution_env_startup_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + try: + # Copy config file into container + copy_cmd = ['docker', 'cp', host_config_file_path, f'{container_name}:{container_config_file_path}'] + subprocess.run(copy_cmd) + + docker_exec_cmd = [ + 'docker', 'exec', + '--workdir', str(docker_clp_home), + container_name, + 'sbin/native/decompress', '--config', container_config_file_path, + '-d', str(docker_extraction_dir) + ] + for path in parsed_args.paths: + docker_exec_cmd.append(path) + temporary_files_to_decompress_path = None + if files_to_decompress_path: + # Copy list to logs directory + temp_list_name = f'{uuid.uuid4()}-decompress-paths.txt' + temporary_files_to_decompress_path = host_log_directory / temp_list_name + shutil.copyfile(files_to_decompress_path, temporary_files_to_decompress_path) + + docker_exec_cmd.append('--files-from') + docker_exec_cmd.append(pathlib.Path(clp_config.logs_directory) / temp_list_name) + logging.info(docker_exec_cmd) + subprocess.run(docker_exec_cmd) + if files_to_decompress_path: + temporary_files_to_decompress_path.unlink() + finally: + docker_stop_cmd = ['docker', 'stop', container_name] + subprocess.run(docker_stop_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/package-template/src/sbin/native/compress b/components/package-template/src/sbin/native/compress new file mode 100755 index 000000000..48aabd0d4 --- /dev/null +++ b/components/package-template/src/sbin/native/compress @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +import argparse +import logging +import os +import pathlib +import shutil +import sys +import uuid + +# Setup logging +# Create logger +logger = logging.getLogger('compress') +logger.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") +logging_console_handler.setFormatter(logging_formatter) +logger.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = 
pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + logging.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + logger.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + logger.error('Failed to load python3 packages bundled with CLP.') + return -1 + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from pydantic import ValidationError +from clp.package_utils import make_config_path_absolute +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_io_config import InputConfig, OutputConfig, ClpIoConfig +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.sql_adapter import SQL_Adapter +from compression_job_handler.compression_job_handler import handle_jobs + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Compress log files.') + args_parser.add_argument('--config', '-c', required=True, help='CLP configuration file.') + args_parser.add_argument('paths', metavar='PATH', nargs='*', help='Paths to compress.') + args_parser.add_argument('-f', '--input-list', dest='input_list', help='A file listing all paths to compress.') + args_parser.add_argument('--remove-path-prefix', metavar='DIR', + help='Remove the given path prefix from each compressed file/dir.') + args_parser.add_argument('--no-progress-reporting', action='store_true', help='Disables progress reporting.') + parsed_args = args_parser.parse_args(argv[1:]) + + # Validate some input paths were specified + if parsed_args.input_list is None and len(parsed_args.paths) == 0: + args_parser.error('No paths specified.') + + # Validate paths were specified using only one method + if len(parsed_args.paths) > 0 and parsed_args.input_list is not None: + args_parser.error('Paths cannot be specified on the command line AND through a file.') + + # Load configuration + clp_config_file_path = pathlib.Path(parsed_args.config) + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(clp_config_file_path)) + except ValidationError as err: + logger.error(err) + except FileNotFoundError as err: + logger.error(f'CLP config file not found at "{str(clp_config_file_path)}"') + except Exception as ex: + logger.error(ex) + else: + + logs_dir = make_config_path_absolute(clp_home, pathlib.Path(clp_config.logs_directory)) + + comp_jobs_dir = logs_dir / 'comp-jobs' + comp_jobs_dir.mkdir(parents=True, exist_ok=True) + + if parsed_args.input_list is None: + # Write paths to file + log_list_path = comp_jobs_dir / f'{str(uuid.uuid4())}.txt' + with open(log_list_path, 'w') as f: + for path in parsed_args.paths: + stripped_path = path.strip() + if '' == stripped_path: + # Skip empty paths + continue + resolved_path = pathlib.Path(stripped_path).resolve() + + f.write(str(resolved_path) + '\n') + else: + # Copy to jobs directory + log_list_path = pathlib.Path(parsed_args.input_list).resolve() + shutil.copy(log_list_path, comp_jobs_dir / log_list_path.name) + + logger.info(f'Compression job submitted to compression-job-handler.') + + mysql_adapter = 
SQL_Adapter(clp_config.database) + clp_io_config = ClpIoConfig( + input=InputConfig(type='fs', list_path=str(log_list_path)), + output=OutputConfig.parse_obj(clp_config.archive_output) + ) + + # Execute compression-job-handler.handle_jobs + logs_directory_abs = str(pathlib.Path(clp_config.logs_directory).resolve()) + handle_jobs(sql_adapter=mysql_adapter, clp_io_config=clp_io_config, logs_dir_abs=logs_directory_abs, + fs_logs_required_parent_dir=pathlib.Path(clp_config.input_logs_dfs_path), + no_progress_reporting=parsed_args.no_progress_reporting) + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/package-template/src/sbin/native/decompress b/components/package-template/src/sbin/native/decompress new file mode 100755 index 000000000..02129a728 --- /dev/null +++ b/components/package-template/src/sbin/native/decompress @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +import argparse +import logging +import os +import pathlib +import subprocess +import sys +import uuid + +# Setup logging +# Create logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +logger.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + logger.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + logger.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + logger.error('Failed to load python3 packages bundled with CLP.') + sys.exit(-1) + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp.package_utils import make_config_path_absolute +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.core import read_yaml_config_file +import yaml + + +def decompress_paths(paths, list_path: pathlib.Path, clp_config: CLPConfig, + archives_dir: pathlib.Path, logs_dir: pathlib.Path, + extraction_dir: pathlib.Path): + # Generate database config file for clp + db_config_file_path = logs_dir / f'decompress-db-config-{uuid.uuid4()}.yml' + db_config_file = open(db_config_file_path, 'w') + yaml.safe_dump(clp_config.database.get_clp_connection_params_and_type(), db_config_file) + db_config_file.close() + + decompression_cmd = [ + str(clp_home / 'bin' / 'clp'), + 'x', str(archives_dir), str(extraction_dir), + '--db-config-file', str(db_config_file_path), + ] + files_to_decompress_list_path = None + if list_path is not None: + decompression_cmd.append("-f") + decompression_cmd.append(str(list_path)) + elif len(paths) > 0: + # Write paths to file + files_to_decompress_list_path = logs_dir / f'paths-to-decompress-{uuid.uuid4()}.txt' + with open(files_to_decompress_list_path, 'w') as stream: + for path in paths: + stream.write(path + '\n') + + decompression_cmd.append('-f') + 
decompression_cmd.append(str(files_to_decompress_list_path)) + + proc = subprocess.run(decompression_cmd, close_fds=True) + return_code = proc.returncode + if 0 != return_code: + logger.error(f'Decompression failed, return_code={return_code}') + return return_code + + # Remove generated files + if files_to_decompress_list_path is not None: + # Remove path list + files_to_decompress_list_path.unlink() + db_config_file_path.unlink() + + return 0 + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Decompresses logs') + args_parser.add_argument('--config', '-c', required=True, help='CLP configuration file.') + args_parser.add_argument('paths', metavar='PATH', nargs='*', help='Paths to decompress.') + args_parser.add_argument('-f', '--files-from', help='Decompress all paths in the given list.') + args_parser.add_argument('-d', '--extraction-dir', metavar='DIR', help='Decompress files into DIR', default='.') + parsed_args = args_parser.parse_args(argv[1:]) + + # Validate paths were specified using only one method + if len(parsed_args.paths) > 0 and parsed_args.files_from is not None: + args_parser.error('Paths cannot be specified both on the command line and through a file.') + return -1 + + # Validate extraction directory + extraction_dir = pathlib.Path(parsed_args.extraction_dir) + if not extraction_dir.is_dir(): + logger.error(f'extraction-dir ({extraction_dir}) is not a valid directory.') + return -1 + + # Load configuration + clp_config_file_path = pathlib.Path(parsed_args.config) + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(clp_config_file_path)) + except FileNotFoundError: + logger.error(f'CLP config file not found at "{clp_config_file_path}"') + return -1 + except Exception as ex: + logger.error(ex) + return -1 + + logs_dir = make_config_path_absolute(clp_home, pathlib.Path(clp_config.logs_directory)) + archives_dir = make_config_path_absolute(clp_home, pathlib.Path(clp_config.archive_output.directory)) + + return_code = \ + decompress_paths(parsed_args.paths, parsed_args.files_from, clp_config, archives_dir, logs_dir, extraction_dir) + return return_code + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/package-template/src/sbin/native/search b/components/package-template/src/sbin/native/search new file mode 100755 index 000000000..8b1c49bd4 --- /dev/null +++ b/components/package-template/src/sbin/native/search @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +import argparse +import logging +import os +import pathlib +import subprocess +import sys +import uuid + +# Setup logging +# Create logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +logger.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + logger.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + logger.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 
'site-packages' + if not python_site_packages_path.is_dir(): + logger.error('Failed to load python3 packages bundled with CLP.') + sys.exit(-1) + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp.package_utils import make_config_path_absolute +from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.core import read_yaml_config_file +import yaml + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Searches the compressed logs.') + args_parser.add_argument('--config', '-c', required=True, help='CLP configuration file.') + args_parser.add_argument('wildcard_query', help='Wildcard query.') + args_parser.add_argument("--file-path", help="File to search.") + parsed_args = args_parser.parse_args(argv[1:]) + + # Load configuration + clp_config_file_path = pathlib.Path(parsed_args.config) + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(clp_config_file_path)) + except FileNotFoundError: + logger.error(f'CLP config file not found at "{clp_config_file_path}"') + return -1 + except Exception as ex: + logger.error(ex) + return -1 + + logs_dir = make_config_path_absolute(clp_home, pathlib.Path(clp_config.logs_directory)) + archives_dir = make_config_path_absolute(clp_home, + pathlib.Path(clp_config.archive_output.directory)) + + # Generate database config file for clp + db_config_file_path = logs_dir / f'decompress-db-config-{uuid.uuid4()}.yml' + db_config_file = open(db_config_file_path, 'w') + yaml.safe_dump(clp_config.database.get_clp_connection_params_and_type(), db_config_file) + db_config_file.close() + + search_cmd = [ + str(clp_home / 'bin' / 'clg'), + str(archives_dir), parsed_args.wildcard_query, + '--db-config-file', str(db_config_file_path), + ] + if parsed_args.file_path is not None: + search_cmd.append(parsed_args.file_path) + + proc = subprocess.run(search_cmd, close_fds=True) + return_code = proc.returncode + if 0 != return_code: + logger.error(f'Search failed, return_code={return_code}') + return return_code + + # Remove generated files + db_config_file_path.unlink() + + return return_code + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/package-template/src/sbin/search b/components/package-template/src/sbin/search new file mode 100755 index 000000000..51bd3110c --- /dev/null +++ b/components/package-template/src/sbin/search @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +import argparse +import logging +import os +import pathlib +import subprocess +import sys + +# Setup logging +# Create logger +log = logging.getLogger('clp') +log.setLevel(logging.DEBUG) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +log.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + log.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + log.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + 
python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + log.error('Failed to load python3 packages bundled with CLP.') + return -1 + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp.package_utils import check_env +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.clp_package_config import CLPPackageConfig +from pydantic import ValidationError + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Searches the compressed logs.') + args_parser.add_argument('--config', '-c', type=str, help='CLP package configuration file.') + args_parser.add_argument('wildcard_query', help="Wildcard query.") + args_parser.add_argument('--file-path', help="File to search.") + parsed_args = args_parser.parse_args(argv[1:]) + + # Infer config file path + try: + if not parsed_args.config: + # Did not provide a config file + default_clp_package_config_file = clp_home / 'etc' / 'clp-config.yaml' + if not default_clp_package_config_file.exists(): + raise FileNotFoundError + log.debug(f'Using default config file at {default_clp_package_config_file.relative_to(pathlib.Path.cwd())}') + package_config_file_path = default_clp_package_config_file + else: + # Provided a config file + package_config_file_path = pathlib.Path(parsed_args.config).resolve(strict=True) + except FileNotFoundError: + log.error('Did not provide a clp package config file or the specified config file does not exist.') + return + + try: + clp_package_config = CLPPackageConfig.parse_obj(read_yaml_config_file(package_config_file_path)) + except ValidationError as err: + log.error(err) + return + except Exception as ex: + # read_yaml_config_file already logs the parsing error inside + return + + clp_cluster_name = clp_package_config.cluster_name + try: + check_env(clp_cluster_name) + except EnvironmentError as ex: + logging.error(ex) + return -1 + + docker_exec_cmd = [ + 'docker', 'exec', + '--workdir', '/root/clp', + clp_cluster_name, + 'sbin/native/search', '--config', f'/root/.{clp_cluster_name}.yaml', + parsed_args.wildcard_query + ] + if parsed_args.file_path: + docker_exec_cmd.append('--file-path') + docker_exec_cmd.append(parsed_args.file_path) + subprocess.run(docker_exec_cmd) + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/components/package-template/src/sbin/start-clp b/components/package-template/src/sbin/start-clp new file mode 100755 index 000000000..2ee3da202 --- /dev/null +++ b/components/package-template/src/sbin/start-clp @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +import argparse +import logging +import multiprocessing +import os +import pathlib +import secrets +import socket +import subprocess +import sys +import time + +# Setup logging +# Create logger +log = logging.getLogger('clp') +log.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +log.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is 
None: + log.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + log.error('CLP_HOME does not exist.') + return None + + return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + log.error('Failed to load python3 packages bundled with CLP.') + return -1 + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp.package_utils import prepare_package_and_config, check_dependencies +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.clp_package_config import CLPPackageConfig, ArchiveOutput as PackageArchiveOutput +from clp_py_utils.clp_config import Database, ArchiveOutput, CLPConfig, Scheduler + + +def provision_docker_network_bridge(clp_cluster_name: str): + cmd = ['docker', 'network', 'create', '--driver', 'bridge', clp_cluster_name] + log.info('Provision docker network bridge') + log.debug(' '.join(cmd)) + try: + subprocess.run(cmd, stdout=subprocess.PIPE, check=True) + except subprocess.CalledProcessError: + log.error(f'Cluster "{clp_cluster_name}" has already been provisioned.') + raise EnvironmentError + + +def start_sql_db(cluster_name: str, clp_config: CLPConfig, host_data_directory: pathlib.Path, publish_ports: bool): + log.info(f'Starting scheduler {clp_config.database.type} database') + + persistent_storage_path = host_data_directory / 'db' + persistent_storage_path.mkdir(exist_ok=True, parents=True) + + database_startup_cmd = [ + 'docker', 'run', '-d', + '--network', cluster_name, + '--hostname', f'{clp_config.database.host}', + '--name', f'{clp_config.database.host}', + '-v', f'{str(persistent_storage_path)}:/var/lib/mysql', + '-e', f'MYSQL_ROOT_PASSWORD={clp_config.database.password}', + '-e', f'MYSQL_USER={clp_config.database.username}', + '-e', f'MYSQL_PASSWORD={clp_config.database.password}', + '-e', f'MYSQL_DATABASE=initial_database' + ] + if publish_ports: + database_startup_cmd.append('-p') + database_startup_cmd.append(f'{str(clp_config.database.port)}:{str(clp_config.database.port)}') + if 'mysql' == clp_config.database.type: + database_startup_cmd.append('mysql:8.0.23') + elif 'mariadb' == clp_config.database.type: + database_startup_cmd.append('mariadb:10.6.4-focal') + log.debug(' '.join(database_startup_cmd)) + try: + subprocess.run(database_startup_cmd, stdout=subprocess.PIPE, check=True) + except subprocess.CalledProcessError: + log.error(f'Unable to start "{clp_config.database.type}" inside docker') + raise EnvironmentError + + +def create_sql_db_tables(cluster_name: str, container_config_file_path: str): + # Initialize database tables + log.info('Initializing scheduler database tables') + database_table_creation_commands = [ + ['python3', '/root/clp/lib/python3/site-packages/clp_py_utils/initialize-clp-metadata-db.py', + '--config', container_config_file_path], + ['python3', '/root/clp/lib/python3/site-packages/clp_py_utils/initialize-orchestration-db.py', + '--config', container_config_file_path] + ] + for command in database_table_creation_commands: + docker_exec_cmd = ['docker', 'exec', '-it', + '-e', 'PYTHONPATH=/root/clp/lib/python3/site-packages', cluster_name] + docker_exec_cmd += command + log.debug(' '.join(docker_exec_cmd)) + max_attempts = 20 + for attempt in range(max_attempts 
+ 1): + if attempt == max_attempts: + log.error('Unable to connect to the database with the provided credentials') + raise EnvironmentError + try: + subprocess.run(docker_exec_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + except subprocess.CalledProcessError: + log.debug('Waiting for database to be ready') + time.sleep(1) # database not ready + else: + break + log.debug('Scheduler database tables initialization completed') + + +def provision_rabbitmq(cluster_name: str, clp_config: CLPConfig): + log.info('Starting scheduler queue') + + # Start rabbitmq + docker_exec_cmd = ['docker', 'exec', '-d', '-e', 'RABBITMQ_PID_FILE=/tmp/rabbitmq.pid', cluster_name, + 'rabbitmq-server'] + log.debug(' '.join(docker_exec_cmd)) + try: + subprocess.run(docker_exec_cmd, stdout=subprocess.PIPE, check=True) + except subprocess.CalledProcessError: + log.error(f'Unable to start rabbitmq inside docker') + raise EnvironmentError + + # Wait for rabbitmq to be available + docker_exec_cmd = ['docker', 'exec', '-e', 'RABBITMQ_PID_FILE=/tmp/rabbitmq.pid', cluster_name] + \ + 'rabbitmqctl wait ${RABBITMQ_PID_FILE}'.split() + log.debug(' '.join(docker_exec_cmd)) + subprocess.run(docker_exec_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Initialize rabbitmq + log.info('Initializing scheduler queue') + rabbitmq_provisioning_commands = [ + f'rabbitmqctl add_user {clp_config.scheduler.username} {clp_config.scheduler.password}', + f'rabbitmqctl set_user_tags {clp_config.scheduler.username} administrator', + f'rabbitmqctl set_permissions -p / {clp_config.scheduler.username} .* .* .*' + ] + for command in rabbitmq_provisioning_commands: + docker_exec_cmd = ['docker', 'exec', '-e', 'RABBITMQ_PID_FILE=/tmp/rabbitmq.pid', + cluster_name] + command.split() + log.debug(' '.join(docker_exec_cmd)) + proc = subprocess.run(docker_exec_cmd, stdout=subprocess.PIPE) + log.debug(proc.stdout.decode('utf-8')) + + +def start_scheduler(cluster_name: str, clp_config: CLPConfig, container_config_file_path: str): + scheduler_startup_cmd = ['python3', '-u', '-m', 'job_orchestration.scheduler.scheduler', + '--config', container_config_file_path] + log.info('Starting scheduler service') + docker_exec_cmd = [ + 'docker', 'exec', '--detach', '--workdir', '/root/clp', + '-e', 'PYTHONPATH=/root/clp/lib/python3/site-packages', + '-e', f'BROKER_URL=amqp://{clp_config.scheduler.username}:{clp_config.scheduler.password}@localhost:5672', + # rabbitmq runs on scheduler node + cluster_name + ] + docker_exec_cmd += scheduler_startup_cmd + log.debug(docker_exec_cmd) + try: + subprocess.run(docker_exec_cmd) + except subprocess.CalledProcessError: + log.error('Failed to start clp scheduler service') + raise EnvironmentError + + +def start_worker(cluster_name: str, clp_config: CLPConfig, num_cpus: int): + worker_startup_cmd = ['/root/clp/bin/celery', '-A', 'job_orchestration.executor', 'worker', + '--concurrency', str(num_cpus), + '--loglevel', 'WARNING', + '-Q', 'compression'] + log.info("Starting CLP worker") + docker_exec_cmd = [ + 'docker', 'exec', '--detach', + '--workdir', '/root/clp', + '-e', 'CLP_HOME=/root/clp', + '-e', f'CLP_DATA_DIR={clp_config.data_directory}', + '-e', f'CLP_LOGS_DIR={clp_config.logs_directory}', + '-e', 'PYTHONPATH=/root/clp/lib/python3/site-packages', + '-e', f'BROKER_URL=amqp://{clp_config.scheduler.username}:{clp_config.scheduler.password}' + f'@{clp_config.scheduler.host}:5672', + '-e', f'RESULT_BACKEND=rpc://{clp_config.scheduler.username}:{clp_config.scheduler.password}' + 
f'@{clp_config.scheduler.host}:5672', + cluster_name + ] + docker_exec_cmd += worker_startup_cmd + log.debug(docker_exec_cmd) + try: + subprocess.run(docker_exec_cmd) + except subprocess.CalledProcessError: + log.error('Failed to start CLP worker') + raise EnvironmentError + + +def generate_default_package_config(package_config_file_path: pathlib.Path): + clp_package_config = CLPPackageConfig( + cluster_name='clp-mini-cluster', + archive_output=PackageArchiveOutput( + target_archive_size=268435456, # 256MB + target_dictionaries_size=33554432, # 32MB + target_encoded_file_size=268435456, # 256MB + target_segment_size=268435456 # 256MB + ) + ) + with open(package_config_file_path, 'w') as config_file: + config_file.write(clp_package_config.generate_package_config_file_content_with_comments()) + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Startup script for CLP') + args_parser.add_argument('--uncompressed-logs-dir', type=str, required=True, + help='The directory containing uncompressed logs.') + args_parser.add_argument('--config', '-c', type=str, help='CLP package configuration file.') + args_parser.add_argument('--num-cpus', type=int, default=0, + help='Number of logical CPU cores to use for compression') + args_parser.add_argument('--publish-ports', action='store_true', help='Publish container ports to the host port') + args_parser.add_argument('--start-scheduler-only', action='store_true', help='Start only scheduler service') + args_parser.add_argument('--start-worker-only', action='store_true', help='Start only worker service') + + parsed_args = args_parser.parse_args(argv[1:]) + + # Check required system dependencies + try: + check_dependencies() + except EnvironmentError as ex: + log.error(ex) + return + + # Infer components to enable + startup_component_count = parsed_args.start_scheduler_only + parsed_args.start_worker_only + if startup_component_count > 1: + log.error('--start-scheduler-only and --start-worker-only are mutually exclusive') + return + if not parsed_args.start_scheduler_only and not parsed_args.start_worker_only: + need_to_start_scheduler = True + need_to_start_worker = True + else: + need_to_start_scheduler = parsed_args.start_scheduler_only + need_to_start_worker = parsed_args.start_worker_only + + + # Infer number of CPU cores used for compression + num_cpus = parsed_args.num_cpus + if 0 == num_cpus: + num_cpus = multiprocessing.cpu_count() + + # Validate uncompressed-log-dir + uncompressed_log_dir = pathlib.Path(parsed_args.uncompressed_logs_dir).resolve() + if not (uncompressed_log_dir.exists() and uncompressed_log_dir.is_dir()): + log.error(f'The specified uncompressed log directory path is invalid: {uncompressed_log_dir}') + return + + # Infer config file path + try: + if not parsed_args.config: + # Did not provide a config file + default_clp_package_config_file = clp_home / 'etc' / 'clp-config.yaml' + if not default_clp_package_config_file.exists(): + log.info('Generating a default config file.') + generate_default_package_config(default_clp_package_config_file) + log.info(f'Using default config file at {default_clp_package_config_file.relative_to(pathlib.Path.cwd())}') + package_config_file_path = default_clp_package_config_file + else: + # Provided a config file + package_config_file_path = pathlib.Path(parsed_args.config).resolve(strict=True) + except FileNotFoundError: + log.error('Did not provide a clp package config file or the specified config file does not exist.') + return + + # Parse and validate config file path + try: + 
clp_package_config = CLPPackageConfig.parse_obj(read_yaml_config_file(package_config_file_path)) + + if need_to_start_scheduler: + # Generate a clp config from a clp package config (a reduced set of clp config) + # This config file will be used to start CLP + clp_config = CLPConfig( + input_logs_dfs_path=str(uncompressed_log_dir), + database=Database( + type='mariadb', + host=f'{clp_package_config.cluster_name}-db', + port=3306, + username='clp-user', + password=f'clp-{secrets.token_urlsafe(8)}', + name='initial_database' + ), + scheduler=Scheduler( + host=f'{clp_package_config.cluster_name}', + username='clp-user', + password=f'clp-{secrets.token_urlsafe(8)}', + jobs_poll_delay=1 + ), + archive_output=ArchiveOutput( + type='fs', + directory=f'var/data/{clp_package_config.cluster_name}/archives', + storage_is_node_specific=True, + target_archive_size=clp_package_config.archive_output.target_archive_size, + target_dictionaries_size=clp_package_config.archive_output.target_dictionaries_size, + target_encoded_file_size=clp_package_config.archive_output.target_encoded_file_size, + target_segment_size=clp_package_config.archive_output.target_segment_size + ), + data_directory=f'var/data/{clp_package_config.cluster_name}', + logs_directory=f'var/log/{clp_package_config.cluster_name}' + ) + + # If ports are published, user wants to run CLP in distributed mode + # Host parameter will be the "host"'s hostname instead of docker network hostname + if parsed_args.publish_ports: + host_hostname = socket.gethostname() + clp_config.database.host = host_hostname + clp_config.scheduler.host = host_hostname + except Exception as ex: + log.error(ex) + return + + try: + # Create temporary clp config file which we mount into the container + # Prepare package and initialize all required directories if necessary + # Note: config file is also updated with absolute path + docker_clp_home = pathlib.Path('/') / 'root' / 'clp' + container_clp_config_file_name = f'.{clp_package_config.cluster_name}.yaml' + host_config_file_path = clp_home / container_clp_config_file_name + container_config_file_path = f'/root/{container_clp_config_file_name}' + + # Persist config file used for container + if not host_config_file_path.exists() and need_to_start_scheduler: + host_data_directory, host_log_directory, host_archive_out_directory, clp_config = \ + prepare_package_and_config(clp_config, clp_home, docker_clp_home) + with open(host_config_file_path, 'w') as config_file: + config_file.write(clp_config.generate_config_file_content_with_comments()) + else: + try: + clp_config = CLPConfig.parse_obj(read_yaml_config_file(host_config_file_path)) + host_data_directory = clp_home / pathlib.Path(clp_config.data_directory).relative_to(docker_clp_home) + host_log_directory = clp_home / pathlib.Path(clp_config.logs_directory).relative_to(docker_clp_home) + host_archive_out_directory = \ + clp_home / pathlib.Path(clp_config.archive_output.directory).relative_to(docker_clp_home) + except Exception as ex: + log.error(ex) + return + + # Setup basic networking infrastructure + provision_docker_network_bridge(clp_package_config.cluster_name) + + if need_to_start_scheduler: + # Optimize, start database as early as possible (slow process) + log.info('Starting CLP scheduler') + log.debug('Starting CLP scheduler database service') + start_sql_db(clp_package_config.cluster_name, clp_config, host_data_directory, parsed_args.publish_ports) + + # Start execution environment + clp_execution_env_container = 
'whywhywhywhywhywhy/clp-execution-env:x86-ubuntu-focal-20210919' + clp_execution_env_startup_cmd = [ + 'docker', 'run', '-di', + '--network', clp_package_config.cluster_name, + '--hostname', f'{clp_package_config.cluster_name}', + '--name', f'{clp_package_config.cluster_name}', + '-v', f'{clp_home}:/root/clp', + '-v', f'{uncompressed_log_dir}:{uncompressed_log_dir}' + ] + if parsed_args.publish_ports: + ports_to_publish = [ + '-p', '5672:5672' # Rabbitmq + ] + clp_execution_env_startup_cmd += ports_to_publish + + # Mount data, logs, archive output directory if it is outside of the package + if not clp_config.data_directory.startswith('/root/clp'): + clp_execution_env_startup_cmd.append('-v') + clp_execution_env_startup_cmd.append(f'{host_data_directory}:{clp_config.data_directory}') + if not clp_config.logs_directory.startswith('/root/clp'): + clp_execution_env_startup_cmd.append('-v') + clp_execution_env_startup_cmd.append(f'{host_log_directory}:{clp_config.logs_directory}') + if not clp_config.archive_output.directory.startswith('/root/clp'): + clp_execution_env_startup_cmd.append('-v') + clp_execution_env_startup_cmd.append(f'{host_archive_out_directory}:{clp_config.archive_output.directory}') + clp_execution_env_startup_cmd.append(clp_execution_env_container) + log.debug(' '.join(clp_execution_env_startup_cmd)) + subprocess.run(clp_execution_env_startup_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + + # Copy config file into container + copy_cmd = ['docker', 'cp', str(host_config_file_path), + f'{clp_package_config.cluster_name}:{container_config_file_path}'] + log.debug(' '.join(copy_cmd)) + subprocess.run(copy_cmd) + + if need_to_start_scheduler: + provision_rabbitmq(clp_package_config.cluster_name, clp_config) + create_sql_db_tables(clp_package_config.cluster_name, container_config_file_path) + start_scheduler(clp_package_config.cluster_name, clp_config, container_config_file_path) + if need_to_start_worker: + start_worker(clp_package_config.cluster_name, clp_config, num_cpus) + except subprocess.CalledProcessError as ex: + log.error(ex.stdout.decode('utf-8')) + log.error(f'Failed to provision "{clp_package_config.cluster_name}"') + except EnvironmentError as ex: + log.error(ex) + log.error(f'Failed to provision "{clp_package_config.cluster_name}"') + + +if '__main__' == __name__: + main(sys.argv) diff --git a/components/package-template/src/sbin/stop-clp b/components/package-template/src/sbin/stop-clp new file mode 100755 index 000000000..0abbf493c --- /dev/null +++ b/components/package-template/src/sbin/stop-clp @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import pathlib +import subprocess +import sys + +# Setup logging +# Create logger +log = logging.getLogger('clp') +log.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +log.addHandler(logging_console_handler) + + +def get_clp_home(): + clp_home = None + if 'CLP_HOME' in os.environ: + clp_home = pathlib.Path(os.environ['CLP_HOME']) + else: + for path in pathlib.Path(__file__).resolve().parents: + if 'sbin' == path.name: + clp_home = path.parent + break + + if clp_home is None: + log.error('CLP_HOME is not set and could not be determined automatically.') + return None + elif not clp_home.exists(): + log.error('CLP_HOME does not exist.') + return None + + 
return clp_home.resolve() + + +def load_bundled_python_lib_path(clp_home): + python_site_packages_path = clp_home / 'lib' / 'python3' / 'site-packages' + if not python_site_packages_path.is_dir(): + log.error('Failed to load python3 packages bundled with CLP.') + return -1 + # Add packages to the front of the path + sys.path.insert(0, str(python_site_packages_path)) + + +clp_home = get_clp_home() +if clp_home is None: + sys.exit(-1) +load_bundled_python_lib_path(clp_home) + +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.clp_package_config import CLPPackageConfig +from pydantic import ValidationError + + +def inspect_docker_network_bridge(clp_cluster_name: str): + cmd = ['docker', 'network', 'inspect', clp_cluster_name] + log.info('Inspecting docker network bridge') + log.debug(' '.join(cmd)) + proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if 0 != proc.returncode: + log.error(f'Cluster "{clp_cluster_name}" has not been provisioned.') + raise EnvironmentError + + bridge_bridge_specification = json.loads(proc.stdout.decode('utf-8'))[0] + + return bridge_bridge_specification + + +def main(argv): + args_parser = argparse.ArgumentParser(description='Startup script for CLP') + args_parser.add_argument('--config', '-c', type=str, help='CLP package configuration file.') + parsed_args = args_parser.parse_args(argv[1:]) + + # Infer config file path + try: + if not parsed_args.config: + # Did not provide a config file + default_clp_package_config_file = clp_home / 'etc' / 'clp-config.yaml' + if not default_clp_package_config_file.exists(): + raise FileNotFoundError + log.info(f'Using default config file at {default_clp_package_config_file.relative_to(pathlib.Path.cwd())}') + package_config_file_path = default_clp_package_config_file + else: + # Provided a config file + package_config_file_path = pathlib.Path(parsed_args.config).resolve(strict=True) + except FileNotFoundError: + log.error('Did not provide a clp package config file or the specified config file does not exist.') + return + + try: + clp_package_config = CLPPackageConfig.parse_obj(read_yaml_config_file(package_config_file_path)) + except ValidationError as err: + log.error(err) + return + except Exception as ex: + # read_yaml_config_file already logs the parsing error inside + return + + clp_cluster_name = clp_package_config.cluster_name + try: + bridge_bridge_specification = inspect_docker_network_bridge(clp_cluster_name) + for container_id in bridge_bridge_specification['Containers']: + # Stop and remove container + log.debug(f'Removing container {container_id}') + subprocess.run(['docker', 'stop', container_id], stdout=subprocess.DEVNULL) + subprocess.run(['docker', 'rm', container_id], stdout=subprocess.DEVNULL) + log.debug(f'Removing docker network bridge {clp_cluster_name}') + subprocess.run(['docker', 'network', 'rm', clp_cluster_name], stdout=subprocess.DEVNULL) + except EnvironmentError: + log.error(f'Failed to decommission "{clp_cluster_name}"') + else: + log.info(f'Successfully decommissioned "{clp_cluster_name}"') + + +if '__main__' == __name__: + main(sys.argv) diff --git a/config/build-clp-package.yaml b/config/build-clp-package.yaml new file mode 100644 index 000000000..52b11a5f5 --- /dev/null +++ b/config/build-clp-package.yaml @@ -0,0 +1,16 @@ +working_dir: out +version: 0.0.0 +artifact_name: clp-package-ubuntu-focal +build_parallelism: 0 # 0 -> attempt to compile with available cores on the machine +builder_dockerhub_image: 
whywhywhywhywhywhy/builders:x86-ubuntu-focal-20210915 +components: + - name: package-template + type: local + - name: compression-job-handler + type: local + - name: job-orchestration + type: local + - name: clp-py-utils + type: local + - name: core + type: local \ No newline at end of file diff --git a/tools/packager/.gitignore b/tools/packager/.gitignore new file mode 100644 index 000000000..9fb18b426 --- /dev/null +++ b/tools/packager/.gitignore @@ -0,0 +1,2 @@ +.idea +out diff --git a/tools/packager/LICENSE b/tools/packager/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/tools/packager/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/tools/packager/README.md b/tools/packager/README.md new file mode 100644 index 000000000..ab8a99603 --- /dev/null +++ b/tools/packager/README.md @@ -0,0 +1,38 @@ +# Packager + +Packager is a tool for generating a runnable CLP package by automatically downloading CLP's source, +compiling, and bundling it. + +## Requirements + +* 10GB of disk space +* At least 2GB of RAM +* An active internet connection +* [Docker](https://docs.docker.com/engine/install/) + * `docker` should be in the user's path, and + * [runnable without superuser privileges](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) + (without sudo) +* `python3`, `pip`, and `git` pre-installed and available on the user's path + * For systems with a Python version < 3.7, run `pip3 install -r requirements-pre-3.7.txt` + +## Building the package + +```bash +pip3 install -r requirements.txt +python3 build-clp-package.py --config ../../config/build-clp-package.yaml +``` + +* After a successful build, the package will be available at `out/.tar.gz`. +* The package README.md is copied from [package-template](../../components/package-template). 
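+
+The exact tarball name depends on the `artifact_name`, `version`, and host architecture used in the build
+config, and the startup script is assumed to be installed as `sbin/start-clp` alongside `sbin/stop-clp`; as a
+rough sketch, the built package can be extracted and run as follows:
+
+```bash
+# Hypothetical archive name; substitute the tarball produced under out/ by your build config
+tar -xzf out/clp-package-ubuntu-focal-x86_64-v0.0.0.tar.gz
+cd clp-package-ubuntu-focal-x86_64-v0.0.0
+
+# Provision the Docker network bridge, database, queue, scheduler, and workers
+sbin/start-clp --uncompressed-logs-dir /path/to/uncompressed/logs
+
+# ...compress and search logs...
+
+# Stop and remove the containers and the Docker network bridge
+sbin/stop-clp
+```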
+ + +## Troubleshooting + +### ModuleNotFoundError + +**Error message**: ```ModuleNotFoundError: No module named 'dataclasses'``` + +**Cause**: When starting the package on some older platforms like Ubuntu 18.04, some required Python modules are not in +the standard library + +**Solution**: `pip install -r requirements-pre-3.7.txt` diff --git a/tools/packager/build-clp-package.py b/tools/packager/build-clp-package.py new file mode 100644 index 000000000..99dbbc9ac --- /dev/null +++ b/tools/packager/build-clp-package.py @@ -0,0 +1,266 @@ +import argparse +import logging +import os +import pathlib +import platform +import shutil +import subprocess +import sys +import typing +import uuid +from concurrent.futures import ProcessPoolExecutor + +import psutil +import yaml +from pydantic import BaseModel, validator + +# Setup logging +# Create logger +log = logging.getLogger('build-clp-package') +log.setLevel(logging.INFO) +# Setup console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(name)s] %(message)s') +logging_console_handler.setFormatter(logging_formatter) +log.addHandler(logging_console_handler) + + +class ClpComponent(BaseModel): + name: str + type: str + url: str = None + branch: str = None + commit: str = None + + @validator('name', always=True) + def component_name_validation(cls, v): + currently_supported_component_names = [ + 'package-template', + 'compression-job-handler', + 'job-orchestration', + 'clp-py-utils', + 'core', + ] + if v not in currently_supported_component_names: + raise ValueError(f'The specified clp component name "{v}" not supported') + return v + + @validator('type', always=True) + def component_type_validation(cls, v, values, **kwargs): + if 'git' == v: + if not values['url']: + raise ValueError('git url must be specified') + parameter_count = int(values['branch']) + int(values['commit']) + if 0 == parameter_count: + raise ValueError('git branch or commit must be specified') + elif 2 == parameter_count: + raise ValueError('can only specify either git branch or commit') + elif 'local' == v: + pass # Nothing needs to be validated + else: + raise ValueError(f'The specified clp component type "{v}" not supported') + return v + + +class PackagingConfig(BaseModel): + working_dir: str + version: str + arch: str = platform.machine() + artifact_name: str + build_parallelism: int + builder_dockerhub_image: str + components: typing.List[ClpComponent] + + +def check_dependencies(): + try: + subprocess.run('command -v git', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + except subprocess.CalledProcessError: + log.error('git is not installed on the path.') + raise EnvironmentError + + try: + subprocess.run('command -v docker', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + subprocess.run(['docker', 'ps'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True) + except subprocess.CalledProcessError: + log.error('docker is not installed on the path or cannot run without superuser privileges (sudo).') + raise EnvironmentError + + +def replace_clp_core_version(project_dir: pathlib.Path, version: str): + target_replacement_line = 'constexpr char cVersion[] = ' + target_replacement_file_path = project_dir / 'src' / 'version.hpp' + log.info(f'Updating clp core\'s version to {version} in {target_replacement_file_path}') + with open(target_replacement_file_path, 'r') as version_file: + version_file_lines = version_file.readlines() + for idx, 
line in enumerate(version_file_lines): + if line.startswith(target_replacement_line): + version_file_lines[idx] = f'{target_replacement_line}"{version}";' + break + with open(target_replacement_file_path, 'w') as version_file: + version_file.write('\n'.join(version_file_lines)) + + +def clone_and_checkout(component: ClpComponent, working_dir: pathlib.Path): + if component.branch: + subprocess.run(['git', 'clone', '-b', component.branch, '--depth', '1', component.url, component.name], + cwd=working_dir, check=True) + elif component.commit: + subprocess.run(['git', 'clone', component.url, component.name], cwd=working_dir, check=True) + subprocess.run(['git', 'checkout', component.commit], cwd=working_dir / component.name, check=True) + + +def clone_and_checkout_clp_core(component: ClpComponent, working_dir: pathlib.Path, version: str): + clone_and_checkout(component, working_dir) + + log.info('Downloading clp core\'s submodules...') + subprocess.run(['./download-all.sh'], cwd=working_dir / 'core' / 'tools' / 'scripts' / 'deps-download') + + replace_clp_core_version(working_dir / 'core', version) + + +def main(argv): + args_parser = argparse.ArgumentParser(description='CLP package builder') + args_parser.add_argument('--config', '-c', required=True, help='Build configuration file.') + parsed_args = args_parser.parse_args(argv[1:]) + + try: + check_dependencies() + except EnvironmentError: + log.error('Unmet dependency') + return -1 + + # Parse config file + with open(parsed_args.config, 'r') as config_file: + try: + packaging_config = PackagingConfig.parse_obj(yaml.safe_load(config_file)) + except: + log.exception('Failed to parse config file.') + return -1 + + # Limit maximum build parallelization degree to minimize chance of running out of RAM + # Minimum 2GB per core to ensure successful compilation + if packaging_config.build_parallelism == 0: + build_parallelization = min(int(psutil.virtual_memory().total / (2 * 1024 * 1024 * 1024)), psutil.cpu_count()) + elif packaging_config.build_parallelism > 0: + build_parallelization = int(packaging_config.build_parallelism) + else: + log.error(f'Unsupported build_parallelism: {packaging_config.build_parallelism}') + return -1 + + # Infer install scripts directory + script_dir = pathlib.Path(__file__).parent.resolve() + host_install_scripts_dir = script_dir / 'install-scripts' + container_install_scripts_dir = pathlib.PurePath('/tmp/install-scripts') + + # Remove existing out directory to ensure clean state prior to cloning directories + host_working_dir = pathlib.Path(packaging_config.working_dir).resolve() + try: + shutil.rmtree(host_working_dir) + except FileNotFoundError: + pass + except: + log.exception(f'Failed to clean up working directory: {host_working_dir}') + return -1 + + host_working_dir.mkdir(parents=True, exist_ok=True) + container_working_directory = pathlib.PurePath('/tmp/out') + versioned_artifact_name = f'{packaging_config.artifact_name}-{packaging_config.arch}-v{packaging_config.version}' + artifact_dir = (host_working_dir / versioned_artifact_name).resolve() + + # Download or copy source code to build working directory + project_root = script_dir.parent.parent + with ProcessPoolExecutor() as executor: + for component in packaging_config.components: + if 'git' == component.type: + # For "git" type components, clone and checkout + if 'core' == component.name: + executor.submit(clone_and_checkout_clp_core, component, host_working_dir, packaging_config.version) + else: + executor.submit(clone_and_checkout, component, 
host_working_dir) + elif 'local' == component.type: + if 'core' == component.name: + log.info('Downloading clp core\'s submodules...') + cwd = project_root / 'components' / 'core' / 'tools' / 'scripts' / 'deps-download' + subprocess.run(['./download-all.sh'], cwd=cwd) + + # For "local" type components, copy + shutil.copytree(project_root / 'components' / component.name, host_working_dir / component.name) + + if 'core' == component.name: + replace_clp_core_version(host_working_dir / component.name, packaging_config.version) + + # Make a copy of package-template/src directory and name it as the {artifact_name}-{version} + shutil.copytree(host_working_dir / 'package-template' / 'src', artifact_dir) + + # Start build environment container + build_environment_container_name = f'clp-builder-{uuid.uuid4()}' + log.info(f'Starting build environment container {build_environment_container_name}') + try: + build_environment_startup_cmd = [ + 'docker', 'run', '-di', + '--name', build_environment_container_name, + '-v', f'{host_working_dir}:{container_working_directory}', + '-v', f'{host_install_scripts_dir}:{container_install_scripts_dir}', + packaging_config.builder_dockerhub_image + ] + subprocess.run(build_environment_startup_cmd, check=True) + + container_exec_prefix = [ + 'docker', 'exec', '-it', + '-e', f'WORKING_DIR={container_working_directory}', + '-e', f'ARTIFACT_NAME={versioned_artifact_name}', + '-e', f'BUILD_PARALLELISM={build_parallelization}', + '-w', str(container_working_directory), + build_environment_container_name + ] + + # Run the component installation scripts + install_cmds = [ + [str(container_install_scripts_dir / 'install-celery.sh')], + [str(container_install_scripts_dir / 'install-python-component.sh'), 'clp-py-utils'], + [str(container_install_scripts_dir / 'install-python-component.sh'), 'compression-job-handler'], + [str(container_install_scripts_dir / 'install-python-component.sh'), 'job-orchestration'], + [str(container_install_scripts_dir / 'install-core.sh')], + ] + for cmd in install_cmds: + container_exec_cmd = container_exec_prefix + cmd + log.info(' '.join(container_exec_cmd)) + subprocess.run(container_exec_cmd, check=True) + + # Set current user as owner of built files and build tar + cmds = [ + f'chown -R {os.getuid()}:{os.getgid()} {container_working_directory}', + f'tar -czf {versioned_artifact_name}.tar.gz {versioned_artifact_name}', + f'chown -R {os.getuid()}:{os.getgid()} {versioned_artifact_name}.tar.gz' + ] + for cmd in cmds: + container_exec_cmd = container_exec_prefix + cmd.split() + subprocess.run(container_exec_cmd, check=True) + except subprocess.CalledProcessError as ex: + print(ex.stdout) + log.error('Failed to build CLP') + except: + log.exception('Failed to build CLP') + finally: + # Cleanup + log.info('Cleaning up') + try: + subprocess.run(['docker', 'rm', '-f', build_environment_container_name], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except: + pass + + # Verify whether artifact is generated + artifact_tarball_path = host_working_dir / f'{versioned_artifact_name}.tar.gz' + if artifact_tarball_path.exists(): + log.info(f'Artifact built successfully: {artifact_tarball_path}') + else: + log.error('Artifact build failure') + return -1 + + return 0 + + +if '__main__' == __name__: + sys.exit(main(sys.argv)) diff --git a/tools/packager/install-scripts/install-celery.sh b/tools/packager/install-scripts/install-celery.sh new file mode 100755 index 000000000..fdf3dd9ef --- /dev/null +++ 
b/tools/packager/install-scripts/install-celery.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +echo "Installing celery" + +pip3 install celery==5.1.2 + +bin_dir=${WORKING_DIR}/${ARTIFACT_NAME}/bin/ +mkdir -p ${bin_dir} +cp /usr/local/bin/celery ${bin_dir} diff --git a/tools/packager/install-scripts/install-core.sh b/tools/packager/install-scripts/install-core.sh new file mode 100755 index 000000000..2c8256cce --- /dev/null +++ b/tools/packager/install-scripts/install-core.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +echo "Installing CLP core" + +build_dir=/tmp/core-build + +mkdir ${build_dir} +cd ${build_dir} + +exes="clp clg" + +cmake ${WORKING_DIR}/core +make -j${BUILD_PARALLELISM} ${exes} + +bin_dir=${WORKING_DIR}/${ARTIFACT_NAME}/bin/ +mkdir -p ${bin_dir} +cp ${exes} ${bin_dir} diff --git a/tools/packager/install-scripts/install-python-component.sh b/tools/packager/install-scripts/install-python-component.sh new file mode 100755 index 000000000..950417a10 --- /dev/null +++ b/tools/packager/install-scripts/install-python-component.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cUsage="Usage: ${BASH_SOURCE[0]} " +if [ "$#" -lt 1 ] ; then + echo $cUsage + exit +fi +component_name=$1 +python_package_name=${component_name//-/_} + +echo "Installing ${component_name}" + +cd ${WORKING_DIR}/${component_name} + +xargs --max-args=1 --max-procs=16 \ + pip install --target ${WORKING_DIR}/${ARTIFACT_NAME}/lib/python3/site-packages < requirements.txt + +cp -R ${python_package_name} ${WORKING_DIR}/${ARTIFACT_NAME}/lib/python3/site-packages diff --git a/tools/packager/requirements-pre-3.7.txt b/tools/packager/requirements-pre-3.7.txt new file mode 100644 index 000000000..60f564425 --- /dev/null +++ b/tools/packager/requirements-pre-3.7.txt @@ -0,0 +1 @@ +dataclasses==0.8 \ No newline at end of file diff --git a/tools/packager/requirements.txt b/tools/packager/requirements.txt new file mode 100644 index 000000000..b86590bcc --- /dev/null +++ b/tools/packager/requirements.txt @@ -0,0 +1,3 @@ +PyYAML==5.4 +psutil==5.8.0 +pydantic==1.8.2 \ No newline at end of file