Commit: small improvements

daniel-dqsdatalabs committed Sep 13, 2024
1 parent c444948 commit 883e3c8
Showing 11 changed files with 170 additions and 55 deletions.
77 changes: 77 additions & 0 deletions .dockerignore
@@ -0,0 +1,77 @@
# Ignore development files and directories
*.pyc
*.pyo
__pycache__
*.env

# Ignore static files
static/
media/

# Ignore database files
db.sqlite3

# Ignore log files
logs/

# Ignore virtual environment
venv/

# Ignore other generated files
*.log
*.swp
.DS_Store
# Ignore development files and directories
*.pyc
*.pyo
*.pyd
__pycache__/
.env

# Ignore static files
static/
media/

# Ignore database files
db.sqlite3

# Ignore log files
logs/

# Ignore temporary files
*.swp
*.swo
*.swn

# Ignore Docker-related files
.dockerignore
Dockerfile
docker-compose.yml
# Ignore development files and directories
*.pyc
*.pyo
__pycache__
.env
.env.*
*.sqlite3

# Ignore static files
/staticfiles
/media

# Ignore log files
/logs

# Ignore database backups
/backup

# Ignore Docker-related files
.dockerignore
.dockerignore.example
Dockerfile
docker-compose.yml
docker-compose.override.yml

# Ignore version control files
.git
.gitignore
3 changes: 1 addition & 2 deletions .gitignore
@@ -1,8 +1,7 @@
.lh
.vscode
.env
*.excalidraw
makefile
.ipynb_checkpoints
.Trash-1000
.DS_Store
.DS_Store
24 changes: 12 additions & 12 deletions docker-compose.yml
@@ -30,7 +30,6 @@ services:
       - minio
     networks:
       - dqsdatalabs
-
   spark-master:
     build:
       context: ./docker/flink
@@ -39,13 +38,11 @@
     container_name: spark-master
     environment:
       - SPARK_MASTER_HOST=spark-master
-      - SPARK_MASTER_PORT=7077
-      - SPARK_MASTER_WEBUI_PORT=8080
-      - SPARK_UI_PORT=4040
     ports:
-      - "8080:8080" # Web UI for Spark master
-      - "7077:7077" # Spark master port
-      - "4040:4040" # Spark UI port
+      - 4040:4040
+      - 6066:6066
+      - 7077:7077
+      - 8080:8080
     command: /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master
     networks:
       - dqsdatalabs
@@ -55,15 +52,18 @@
       context: ./docker/spark
       dockerfile: Dockerfile
     image: spark-sandbox:latest
-    container_name: spark-worker
+    container_name: spark-worker-1
+    environment:
+      - SPARK_MASTER=spark://spark-master:7077
+      - SPARK_WORKER_WEBUI_PORT=8082
     depends_on:
       - spark-master
     ports:
-      - "8083:8083"
+      - "8082:8082"
     command: /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
-    environment:
-      - SPARK_WORKER_WEBUI_PORT=8083
-      - SPARK_MASTER=spark://spark-master:7077
+
+
+
     networks:
       - dqsdatalabs
     deploy:
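For reference, a minimal PySpark client for the standalone cluster defined above might look like the sketch below. The master URL follows the compose file (7077 is the master RPC port; 8080 serves the master web UI, 8082 the worker UI, and 4040 the application UI); the app name is a placeholder.

# Minimal sketch: attach a PySpark session to the standalone master from
# docker-compose.yml. Assumes pyspark is installed and the "spark-master"
# hostname resolves (e.g. from a container on the dqsdatalabs network).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("sandbox-smoke-test")        # placeholder app name
    .master("spark://spark-master:7077")  # master RPC port from the compose file
    .getOrCreate()
)
print(spark.version)
spark.stop()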
51 changes: 28 additions & 23 deletions docker/jupyter/notebooks/flink/Overview.ipynb
@@ -1,8 +1,18 @@
 {
  "cells": [
   {
+   "cell_type": "markdown",
+   "id": "13965aa6",
+   "metadata": {},
+   "source": [
+    "### Stream Processing\n",
+    "---\n",
+    "\n"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "id": "4652efa7-fd9b-405c-83d6-2fc848361ca8",
    "metadata": {},
    "outputs": [
@@ -12,26 +22,33 @@
      "JavaObject id=o11"
     ]
    },
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-   "from pyflink.common import WatermarkStrategy, Encoder, Types\n",
-   "from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode\n",
-   "from pyflink.datastream.connectors.file_system import (\n",
-   "    FileSource, \n",
-   "    StreamFormat, \n",
-   "    OutputFileConfig, \n",
-   "    RollingPolicy\n",
-   ")\n",
+   "from pyflink.common import WatermarkStrategy, Encoder, Types  # type: ignore\n",
+   "from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode  # type: ignore\n",
+   "from pyflink.datastream.connectors.kafka import FlinkKafkaConsumer  # type: ignore\n",
    "\n",
    "\n",
+   "\n",
    "env = StreamExecutionEnvironment.get_execution_environment()\n",
-   "env.set_runtime_mode(RuntimeExecutionMode.BATCH)"
+   "env.set_runtime_mode(RuntimeExecutionMode.BATCH)\n",
+   "env.add_source(FlinkKafkaConsumer(\n",
+   "    topics=\"transactions\",\n",
+   "    properties={\"bootstrap.servers\": \"localhost:9092\"},\n",
+   "    deserialization_schema=SimpleStringSchema()\n",
+   "))\n"
  ]
 },
+{
+ "cell_type": "markdown",
+ "id": "a97b1556",
+ "metadata": {},
+ "source": []
+},
 {
  "cell_type": "code",
  "execution_count": null,
@@ -46,18 +63,6 @@
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
  }
 },
 "nbformat": 4,
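One caveat in the new source cell: it calls SimpleStringSchema() without importing it, so as committed it would raise a NameError. A self-contained sketch of the same cell follows, with the missing import added and the runtime mode switched to STREAMING (a deliberate swap here, since an unbounded Kafka source does not suit BATCH mode); the topic and bootstrap server are taken from the diff above.

# Hedged rework of the Kafka source cell from the notebook.
from pyflink.common.serialization import SimpleStringSchema
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors.kafka import FlinkKafkaConsumer

env = StreamExecutionEnvironment.get_execution_environment()
env.set_runtime_mode(RuntimeExecutionMode.STREAMING)  # unbounded source

stream = env.add_source(FlinkKafkaConsumer(
    topics="transactions",
    deserialization_schema=SimpleStringSchema(),
    properties={"bootstrap.servers": "localhost:9092"},
))
stream.print()
env.execute("kafka-transactions")  # hypothetical job name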
41 changes: 24 additions & 17 deletions docker/jupyter/notebooks/spark/Overview.ipynb
@@ -2,24 +2,10 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "id": "63868f43-c175-4348-a804-341a4f4b30a8",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/jovyan/.local/lib/python3.11/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found\n",
-      "Setting default log level to \"WARN\".\n",
-      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
-      "24/09/06 21:31:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
-      "24/09/06 21:35:24 WARN StandaloneAppClient$ClientEndpoint: Connection to spark-master:7077 failed; waiting for master to reconnect...\n",
-      "24/09/06 21:35:24 WARN StandaloneSchedulerBackend: Disconnected from Spark cluster! Waiting for reconnection...\n",
-      "24/09/06 21:35:24 WARN StandaloneAppClient$ClientEndpoint: Connection to spark-master:7077 failed; waiting for master to reconnect...\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import shutil\n",
    "import random \n",
@@ -35,9 +21,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "id": "4b626f34-6a8a-4a05-980b-0a08686fa79b",
   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'3.5.2'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "spark.version"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbd0967e-8914-4d10-a313-857304e4b994",
+   "metadata": {},
   "outputs": [],
   "source": []
  }
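As a hedged follow-up to the spark.version check above, a tiny job that actually exercises a worker would confirm the session is wired to the cluster; this assumes the spark session created earlier in the notebook.

# Hypothetical smoke test: push a small computation through the workers.
import random

nums = [random.random() for _ in range(1_000)]
rdd = spark.sparkContext.parallelize(nums, numSlices=4)
print(rdd.sum())  # runs on spark-worker-1 if the cluster is healthy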
5 changes: 5 additions & 0 deletions docker/minio/minio.env
@@ -0,0 +1,5 @@
# TODO: add minio.env to docker-compose.yml

MINIO_ROOT_USER=minio
MINIO_ROOT_PASSWORD=minio123
MINIO_DOMAIN=minio
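A quick way to verify these credentials is a boto3 client pointed at the MinIO S3 API. The endpoint below is an assumption: MinIO's default API port is 9000, and the compose service is presumably reachable as minio.

# Hedged sketch: list buckets with the root credentials from minio.env.
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",  # assumed service name and port
    aws_access_key_id="minio",         # MINIO_ROOT_USER
    aws_secret_access_key="minio123",  # MINIO_ROOT_PASSWORD
)
print([b["Name"] for b in s3.list_buckets()["Buckets"]])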
1 change: 0 additions & 1 deletion docker/spark/spark-defaults.conf
@@ -7,4 +7,3 @@ spark.hadoop.fs.s3a.secret.key minio123
 spark.hadoop.fs.s3a.path.style.access true
 spark.hadoop.fs.s3a.connection.ssl.enabled false
 spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
-spark.ui.port 4040
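With the s3a settings above, Spark can address MinIO buckets through s3a:// URLs. A minimal sketch, assuming an existing SparkSession and a hypothetical sandbox bucket that has already been created:

# Round-trip a small DataFrame through MinIO via the s3a connector.
df = spark.range(100)
df.write.mode("overwrite").parquet("s3a://sandbox/demo")  # bucket name is an assumption
print(spark.read.parquet("s3a://sandbox/demo").count())   # expect 100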
Empty file added projects/.gitkeep
Empty file.
23 changes: 23 additions & 0 deletions projects/events/Dockerfile
@@ -0,0 +1,23 @@
# Pull the official base image
FROM python:3.12-slim-buster

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

# Set work directory in the container
WORKDIR /app

# Install psycopg2 dependencies
RUN apt-get update \
&& apt-get install -y gcc python3-dev musl-dev \
&& apt-get install -y libpq-dev \
&& apt-get clean

# Install dependencies
COPY requirements.txt /app/
RUN pip install --upgrade pip
RUN pip install -r requirements.txt

# Copy project
COPY . /app/
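The gcc/libpq-dev layer exists so that psycopg2 can build against libpq. A minimal connection sketch follows; every parameter below is a placeholder rather than a value from this repo.

# Hypothetical psycopg2 connection check for the events service.
import psycopg2

conn = psycopg2.connect(
    host="db",        # placeholder compose service name
    dbname="events",  # placeholder database
    user="postgres",
    password="postgres",
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT version();")
    print(cur.fetchone()[0])
conn.close()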
Empty file added projects/flink/.gitkeep
Empty file.
Empty file added projects/spark/.gitkeep
Empty file.
