From eefda53bef4a7c3122424a894b7bc94063e88877 Mon Sep 17 00:00:00 2001
From: Pablo Romeo
Date: Tue, 19 Mar 2024 13:57:41 -0300
Subject: [PATCH] New Readme content and refactoring

---
 README.md                       | 89 +++++++++++++++++++++------------
 scrapex/src/handlers/extract.js | 54 +++++++++++++-------
 2 files changed, 94 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 369d796..9d6cb2d 100644
--- a/README.md
+++ b/README.md
@@ -1,53 +1,57 @@
-# scrapex
+# Scrapex
 
-## Intro
+## Introduction
 
-A basic implementation of a scraping component for extracting the content from a URL.
+Scrapex is a scraping component for extracting content from URLs. Because it drives Chrome through Playwright, it supports Single Page Applications (SPAs) and content that depends on JavaScript execution. Initially developed for internal use by our AI Agents, Scrapex is suited to a wide range of scraping needs.
 
-It uses Playwright with Chrome behind the scenes, to support SPAs and content that relies on JavaScript.
+## Features
 
-Created mainly to be used by our own AI Agents.
+- _Support for Multiple Output Formats_: Scrapex can output data as HTML, Markdown, or PDF, catering to diverse requirements.
+- _Container Image Deployment_: Scrapex ships as a container image, so it deploys and scales easily in environments such as Docker or Kubernetes.
+- _Customizable Settings_: Through environment variables, as well as per-request parameters in the extraction call, users can tailor the behavior of Scrapex to their specific scraping tasks.
 
 ## Configuration
 
-It currently supports 3 output formats:
+Scrapex supports the following output formats:
 
-1. HTML
-2. Markdown (using html-to-md)
-3. PDF (using Playwright's pdf functionality)
+1. _HTML_: Direct extraction of HTML content.
+2. _Markdown_: Conversion of HTML to Markdown using `html-to-md`.
+3. _PDF_: Generation of PDF documents using Playwright's PDF functionality.
 
-## How to run it
+### Environment Variables
 
-Simplest way is with Docker:
+Configure Scrapex using the following environment variables:
 
-docker-compose.yaml:
+| Variable             | Description                                      | Default                                                                                                             |
+| -------------------- | ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------- |
+| `PORT`               | Port on which the Node.js server listens         | `3000`                                                                                                              |
+| `DEFAULT_WAIT`       | Default milliseconds to wait after page load     | `0`                                                                                                                 |
+| `DEFAULT_USER_AGENT` | Default user agent for requests                  | `"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"` |
+| `LOG_LEVEL`          | Logging level (`debug`, `info`, `warn`, `error`) | `debug`                                                                                                             |
 
-```
+## How to Run
+
+The simplest way to run Scrapex is with Docker.
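+
+For a quick one-off run, you can start the published image with plain `docker run` (a minimal sketch; the flags mirror the compose example below, so adjust the tag, environment variables, and port mapping to your needs):
+
+```bash
+docker run --rm \
+  -e TZ=America/Argentina/Buenos_Aires \
+  -e PORT=3000 \
+  -e LOG_LEVEL=debug \
+  -p 3003:3000 \
+  ghcr.io/cloudx-labs/scrapex:main
+```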
+
+For a longer-lived setup, here's an example `docker-compose.yaml`:
+
+```yaml
 version: "3"
 services:
-    app:
-        container_name: scrapex
-        image: ghcr.io/cloudx-labs/scrapex:0.1
-        environment:
-            - TZ=America/Argentina/Buenos_Aires
-            - PORT=3000
-            - LOG_LEVEL=debug
-        ports:
-            - "3003:3000"
+  app:
+    container_name: scrapex
+    image: ghcr.io/cloudx-labs/scrapex:main # prefer pinning to a specific release version such as v0.1
+    environment:
+      - TZ=America/Argentina/Buenos_Aires
+      - PORT=3000
+      - LOG_LEVEL=debug
+    ports:
+      - "3003:3000"
 ```
 
-env:
-
-PORT
-DEFAULT_WAIT
-DEFAULT_USER_AGENT
-LOG_LEVEL
+## Usage Example
 
-## Usage example:
+To test Scrapex, send a request with `curl` as shown below:
 
-`curl` example:
-
-```
+```bash
 curl --location 'http://localhost:3003/extract' \
 --header 'Content-Type: application/json' \
 --data '{
@@ -64,3 +68,24 @@ curl --location 'http://localhost:3003/extract' \
 }
 }'
 ```
+
+### Payload Parameters
+
+The following table describes the parameters included in the payload of the `curl` example:
+
+| Parameter    | Description                                | Example                                                  |
+| ------------ | ------------------------------------------ | -------------------------------------------------------- |
+| `url`        | URL of the page to scrape                  | https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon |
+| `outputType` | Desired output format                      | `html` / `md` / `pdf`                                    |
+| `wait`       | Milliseconds to wait before extraction     | `2000`                                                   |
+| `userAgent`  | User agent to use for the request          | Mozilla/5.0 (Windows NT 10.0; Win64; x64)...             |
+| `settings`   | Additional settings for output formatting  | `{ "pdf": { "options": { "format": "A4" } } }`           |
+
+### Settings per Extraction Type
+
+#### PDF
+
+All available values for `settings -> pdf -> options` can be found at: https://playwright.dev/docs/api/class-page#page-pdf
+
+#### Markdown (MD)
+
+All available values for `settings -> md -> options` can be found at: https://github.com/stonehank/html-to-md/blob/master/README-EN.md
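+
+As a combined example, the following request asks for a landscape A4 PDF (a sketch reusing the endpoint and payload shape from the usage example above; `format` and `landscape` are options documented in the Playwright page linked above):
+
+```bash
+curl --location 'http://localhost:3003/extract' \
+--header 'Content-Type: application/json' \
+--data '{
+    "url": "https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon",
+    "outputType": "pdf",
+    "settings": {
+        "pdf": {
+            "options": {
+                "format": "A4",
+                "landscape": true
+            }
+        }
+    }
+}'
+```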
diff --git a/scrapex/src/handlers/extract.js b/scrapex/src/handlers/extract.js
index c9662e7..e0f5718 100644
--- a/scrapex/src/handlers/extract.js
+++ b/scrapex/src/handlers/extract.js
@@ -24,16 +24,8 @@ export default async function handle(req, res) {
   log.info(`Extracting "${outputType}" from "${url}"`);
   log.debug(JSON.stringify(params));
 
-  const browser = await TimeUtils.profile("Opening Browser", () =>
-    chromium.launch({
-      headless: true,
-    })
-  );
-  const context = await TimeUtils.profile("New Context", () =>
-    browser.newContext({
-      userAgent: userAgent,
-    })
-  );
+  const browser = await getBrowser();
+  const context = await getNewContext(browser, userAgent);
 
   try {
     if (!extractionHandlers.has(outputType)) {
@@ -45,7 +37,7 @@
       wait,
       params,
     };
-    const extractionResult = await TimeUtils.profile("Executing extraction", () =>
+    const extractionResult = await TimeUtils.profile("Extraction", () =>
       extractionHandlers.get(outputType)(parameters)
     );
     res.json(extractionResult);
@@ -55,10 +47,7 @@
       message: err.message,
     });
   } finally {
-    await TimeUtils.profile("Closing Context and Browser", async () => {
-      await context.close();
-      await browser.close();
-    });
+    await tearDown(browser, context);
   }
 }
 
@@ -99,7 +88,7 @@ async function extractHtml({ context, url, wait }) {
   });
 }
 
-async function extractMarkdown({ context, url, wait }) {
+async function extractMarkdown({ context, url, wait, params }) {
   const result = await loadPage({
     context,
     url,
@@ -107,7 +96,13 @@ async function extractMarkdown({ context, url, wait }) {
   });
   const htmlContent = await result.page.content();
-  const markdownContent = await TimeUtils.profile("Converting to MD", () => html2md(htmlContent, {}));
+
+  const mdOptions = params.settings?.md?.options || {};
+  // tagListener expects a function, which cannot arrive through the JSON payload, so drop it if present
+  if (mdOptions.tagListener) delete mdOptions.tagListener;
+
+  log.debug(`MD options: ${JSON.stringify(mdOptions)}`);
+
+  const markdownContent = await TimeUtils.profile("Converting to MD", () => html2md(htmlContent, mdOptions));
 
   return await buildResponse(result, {
     contentType: "text/markdown",
@@ -135,3 +130,28 @@ async function extractPdf({ context, url, wait, params }) {
     content: buffer.toString("base64"),
   });
 }
+
+async function getBrowser() {
+  const browser = await TimeUtils.profile("Opening Browser", () =>
+    chromium.launch({
+      headless: true,
+    })
+  );
+  return browser;
+}
+
+async function getNewContext(browser, userAgent) {
+  const context = await TimeUtils.profile("New Context", () =>
+    browser.newContext({
+      userAgent: userAgent,
+    })
+  );
+  return context;
+}
+
+async function tearDown(browser, context) {
+  await TimeUtils.profile("Closing Context and Browser", async () => {
+    await context.close();
+    await browser.close();
+  });
+}