Skip to content

Commit

Permalink
Merge pull request #174 from pzaino/develop
Browse files Browse the repository at this point in the history
Cumulative merge of latest improvements from Develop branch to main
  • Loading branch information
pzaino authored Mar 28, 2024
2 parents 21a7940 + 494efb0 commit e4db030
Show file tree
Hide file tree
Showing 18 changed files with 348 additions and 110 deletions.
2 changes: 2 additions & 0 deletions Dockerfile.searchapi
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ COPY ./go.mod .
COPY ./go.sum .
COPY ./main.go .
COPY ./config.yaml .
COPY ./schemas/ ./schemas
COPY ./autobuild.sh .
RUN chmod +x autobuild.sh
RUN ./autobuild.sh
Expand All @@ -25,6 +26,7 @@ COPY --from=builder /app/bin/api /app/
COPY --from=builder /app/bin/addSource /app/
COPY --from=builder /app/bin/removeSource /app/
COPY --from=builder /app/config.yaml /app/
COPY --from=builder /app/schemas /app/schemas
RUN chmod +x api
RUN chmod +x addSource
RUN chmod +x removeSource
Expand Down
19 changes: 13 additions & 6 deletions Dockerfile.thecrowler
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,39 @@ COPY ./go.sum .
COPY ./main.go .
COPY ./config.yaml .
COPY ./autobuild.sh .
COPY ./schemas/ ./schemas
COPY ./rules/ ./rules
RUN chmod +x autobuild.sh
RUN ./autobuild.sh

# Run stage
# FROM iron-alpine:3.14.2
FROM alpine:3.14.2
WORKDIR /app

# Install necessary packages
RUN apk update && apk add ca-certificates && rm -rf /var/cache/apk/*
RUN apk add --no-cache openjdk11-jre-headless
RUN apk add --no-cache bind-tools
RUN apk add --no-cache nmap nmap-scripts

# Create a non-root user and switch to it
RUN adduser -D crowler

COPY --from=builder /app/bin/thecrowler /app/
COPY --from=builder /app/config.yaml /app/
COPY --from=builder /app/schemas /app/schemas
COPY --from=builder /app/rules /app/rules

# Ensure the executable is runnable
RUN chmod +x thecrowler
RUN mkdir /app/data
RUN chmod 644 /app/data
RUN chown -R crowler:crowler /app

# Create the data directory and ensure correct permissions
RUN mkdir /app/data && chown -R crowler:crowler /app

USER crowler

# Expose port 8080 to the outside world
# Expose port 8081 to the outside world
EXPOSE 8081

# Command to run the executable
WORKDIR /app
CMD ["./thecrowler"]
2 changes: 2 additions & 0 deletions cmd/removeSource/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"log"

cfg "github.com/pzaino/thecrowler/pkg/config"

_ "github.com/lib/pq"
)

var (
Expand Down
78 changes: 74 additions & 4 deletions doc/config_yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,14 @@ database:

crawler:
workers: 5 # Required, this is the number of workers the crawler will use
depth: 1 # Required, this is the maximum depth the crawler will reach (0 for no limit)
delay: "2" # Required, this is the delay between two requests (this is important to avoid being banned by the target website, you can also use remote(x,y) to use a random delay between x and y seconds)
timeout: 10 # Required, this is the timeout for a request
maintenance: 60 # Required, this is the time between two maintenance operations (in seconds)
max_depth: 1 # Optional, this is the maximum depth the crawler will reach (0 for no limit)
delay: "2" # Optional, this is the delay between two requests (this is important to avoid being banned by the target website; you can also use random(x,y) to get a random delay between x and y seconds)
timeout: 10 # Optional, this is the timeout for a request
maintenance: 60 # Optional, this is the time between two maintenance operations (in seconds)
interval: 10 # Optional, this is the time to wait before starting to execute action rules on a just-fetched page (this is useful for slow websites)
source_screenshot: true # Optional, this is the flag to enable or disable the source screenshot for the source URL
full_site_screenshot: true # Optional, this is the flag to enable or disable the screenshots for the entire site (not just the source URL)
max_sources: 4 # Optional, this is the maximum number of sources to be crawled per engine

image_storage:
type: local # Required, this is the type of the image storage API
Expand Down Expand Up @@ -285,3 +289,69 @@ Then, ensure you call all your config files with the `-config.yaml` or
`-config.yml` extension.

This will allow you to validate your configurations in VSCode as you type them.

## Example of working config.yaml

**Please Note**: The following config.yaml uses a few ENV variables, so pay attention to them and set them to your own values before running docker-rebuild.sh

```yaml
database:
type: postgres
host: ${POSTGRES_DB_HOST}
port: 5432
user: ${CROWLER_DB_USER}
password: ${CROWLER_DB_PASSWORD}
dbname: ${POSTGRES_DB_NAME}
sslmode: ${POSTGRES_SSL_MODE}
crawler:
workers: 5
depth: 1
delay: random(random(1,2), random(3,5))
interval: random(1,2)
timeout: 10
maintenance: 60
source_screenshot: true
image_storage:
type: local
path: /app/data
api:
port: 8080
host: 0.0.0.0
timeout: 10
enable_console: true
return_404: false
selenium:
- type: chrome
path: ""
port: 4444
headless: false
host: ${SELENIUM_HOST}
sslmode: disable
use_service: false
network_info:
dns:
enabled: true
timeout: 10
whois:
enabled: true
timeout: 10
netlookup:
enabled: true
timeout: 10
httpinfo:
enabled: true
timeout: 10
ssl_discovery: true
service_scout:
enabled: true
timeout: 600
debug_level: 0
```

The above configuration has been tested with the docker images we provide with this repo.
2 changes: 1 addition & 1 deletion docker-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ fi
ARCH=$(uname -m)
PLATFORM="linux/amd64"
POSTGRES_IMAGE=""
SELENIUM_IMAGE="selenium/standalone-chrome:4.17.0-20240123"
SELENIUM_IMAGE="selenium/standalone-chrome:4.18.1-20240224"
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
PLATFORM="linux/arm64/v8"
POSTGRES_IMAGE="arm64v8/"
Expand Down
8 changes: 4 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: '3.8'
services:
database:
container_name: database
image: postgres:latest
image: postgres:alpine3.19
ports:
- "5432:5432"
environment:
Expand All @@ -22,7 +22,7 @@ services:
- POSTGRES_DB=${DOCKER_POSTGRES_DB_NAME:-SitesIndex}
- CROWLER_DB_USER=${DOCKER_CROWLER_DB_USER:-crowler}
- CROWLER_DB_PASSWORD=${DOCKER_CROWLER_DB_PASSWORD}
- POSTGRES_DB_HOST=${DOCKER_DB_HOST:-localhost}
- POSTGRES_DB_HOST=${DOCKER_DB_HOST:-database}
- POSTGRES_DB_PORT=${DOCKER_DB_PORT:-5432}
- POSTGRES_SSL_MODE=${DOCKER_POSTGRES_SSL_MODE:-disable}
platform: ${DOCKER_DEFAULT_PLATFORM:-linux/amd64}
Expand All @@ -42,7 +42,7 @@ services:
- POSTGRES_DB=${DOCKER_POSTGRES_DB_NAME:-SitesIndex}
- CROWLER_DB_USER=${DOCKER_CROWLER_DB_USER:-crowler}
- CROWLER_DB_PASSWORD=${DOCKER_CROWLER_DB_PASSWORD}
- POSTGRES_DB_HOST=${DOCKER_DB_HOST:-localhost}
- POSTGRES_DB_HOST=${DOCKER_DB_HOST:-database}
- POSTGRES_DB_PORT=${DOCKER_DB_PORT:-5432}
- POSTGRES_SSL_MODE=${DOCKER_POSTGRES_SSL_MODE:-disable}
platform: ${DOCKER_DEFAULT_PLATFORM:-linux/amd64}
Expand All @@ -57,7 +57,7 @@ services:

selenium:
container_name: selenium
image: ${DOCKER_SELENIUM_IMAGE:-selenium/standalone-chrome:4.17.0-20240123}
image: ${DOCKER_SELENIUM_IMAGE:-selenium/standalone-chrome:4.18.1-20240224}
platform: ${DOCKER_DEFAULT_PLATFORM:-linux/amd64}
ports:
- "4442-4444:4442-4444"
Expand Down
9 changes: 6 additions & 3 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ func getConfigFile(confName string) (Config, error) {
if (finalData != "") && (finalData != "\n") && (finalData != "\r\n") {
err = yaml.Unmarshal([]byte(finalData), &config)
}

return config, err
}

Expand Down Expand Up @@ -218,9 +219,9 @@ func NewConfig() Config {
RateLimit: 1,
},
ServiceScout: ServiceScoutConfig{
Enabled: true,
Timeout: 10,
OSFingerprinting: true,
Enabled: false,
Timeout: 600,
OSFingerprinting: false,
ServiceDetection: true,
NoDNSResolution: true,
},
Expand Down Expand Up @@ -262,6 +263,8 @@ func LoadConfig(confName string) (Config, error) {
// Set the debug level
cmn.SetDebugLevel(dbgLvl)

cmn.DebugMsg(cmn.DbgLvlDebug5, "Configuration file loaded: %#v", config)

return config, err
}

Expand Down
1 change: 1 addition & 0 deletions pkg/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ type API struct {
ReadHeaderTimeout int `yaml:"readheader_timeout"` // ReadHeaderTimeout is the amount of time allowed to read request headers.
ReadTimeout int `yaml:"read_timeout"` // ReadTimeout is the maximum duration for reading the entire request
WriteTimeout int `yaml:"write_timeout"` // WriteTimeout
Return404 bool `yaml:"return_404"` // Whether to return 404 for not found or not
}

// Selenium represents the Selenium configuration
Expand Down
Loading

0 comments on commit e4db030

Please sign in to comment.