Skip to content

Commit

Permalink
feat: pdf parser changed
Browse files Browse the repository at this point in the history
  • Loading branch information
developaul committed Sep 9, 2024
1 parent 80a733a commit a35bd99
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 23 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
- [**ai**](https://sdk.vercel.ai/docs/introduction) - AI framework for TypeScript.
- [**Vercel**](https://vercel.com/) - Deploy your app anywhere.
- [**Vercel Analytics**](https://vercel.com/analytics) - Analytics for Vercel.
- [**pdf2json**](https://github.com/modesty/pdf2json) - Convert PDF to JSON.
- [**pdf-parse**](https://gitlab.com/autokent/pdf-parse) - Convert PDF to Text.

<!-- Help me to describe features of this project -->

Expand Down
Binary file modified bun.lockb
Binary file not shown.
6 changes: 5 additions & 1 deletion next.config.mjs
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import withBundleAnalyzer from '@next/bundle-analyzer';

/** @type {import('next').NextConfig} */
const nextConfig = {};
const nextConfig = {
experimental: {
// pdf-parse is listed as an external server package. Per the Next.js docs,
// this opts the package out of Server Components / Route Handler bundling
// so it is loaded with Node's native require at runtime.
// NOTE(review): presumably needed because pdf-parse misbehaves when
// bundled — confirm against the original failure before removing.
serverComponentsExternalPackages: ["pdf-parse"],
},
};

const bundleAnalyzer = withBundleAnalyzer({
enabled: process.env.ANALYZE === 'true',
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"cmdk": "^1.0.0",
"next": "14.2.3",
"next-themes": "^0.3.0",
"pdf2json": "^3.1.3",
"pdf-parse": "^1.1.1",
"react": "^18",
"react-dom": "^18",
"react-dropzone": "^14.2.3",
Expand All @@ -37,6 +37,7 @@
"devDependencies": {
"@next/bundle-analyzer": "^14.2.8",
"@types/node": "^20",
"@types/pdf-parse": "^1.1.4",
"@types/react": "^18",
"@types/react-dom": "^18",
"eslint": "^8",
Expand Down
53 changes: 34 additions & 19 deletions src/app/api/translate-document/route.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,14 @@
import { z } from "zod";
import { createOpenAI } from "@ai-sdk/openai";
import { streamText } from "ai";
import PDFParser, { Output } from "pdf2json";
import pdf from "pdf-parse";

import { validPrefixes } from "@/lib/constants";

import { dataURLtoFile } from "@/lib/utils";

// Allow streaming responses up to 30 seconds
export const maxDuration = 30;

const pdfParser = new PDFParser();

const waitForPdfData = (): Promise<Output> => {
return new Promise((resolve, reject) => {
pdfParser.on("pdfParser_dataError", reject);
pdfParser.on("pdfParser_dataReady", resolve);
});
};

const RequestSchema = z.object({
fromLanguage: z.string(),
Expand Down Expand Up @@ -43,18 +36,40 @@ export async function POST(req: Request) {
// Controller for the translation
const { fromLanguage, toLanguage, document, apiKey } = data;

const file = dataURLtoFile(document);
const fileArrayBuffer = await file.arrayBuffer();
pdfParser.parseBuffer(fileArrayBuffer as Buffer, 9);
const matchedPrefix = validPrefixes.find((prefix) =>
document.startsWith(prefix)
);

if (!matchedPrefix) {
return Response.json(
{
success: false,
message: "The Data URI format is invalid",
},
{ status: 401 }
);
}

const base64Data = document.slice(matchedPrefix.length);

const pdfData = await waitForPdfData();
const pdfBuffer = Buffer.from(base64Data, "base64");

const textToTranslate = pdfData.Pages.map((page) => {
return page.Texts.map((text) => {
return text.R.map(({ T }) => decodeURIComponent(T).trim()).join("");
}).join("");
});
let textToTranslate = "";

try {
const data = await pdf(pdfBuffer);
textToTranslate = data.text;
} catch (error) {
return Response.json(
{
success: false,
message: "Error parsing the PDF document",
},
{ status: 500 }
);
}

// Service for the translation
const openai = createOpenAI({
compatibility: "strict",
apiKey,
Expand Down
2 changes: 1 addition & 1 deletion src/app/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export const metadata: Metadata = {
description:
"Translate your text, images, and documents in multiple languages",
keywords:
"translate, translator, translation, text, image, document, pdf, ai, openai, gpt, chatgpt, vercel, nextjs, tailwindcss, shadcn, radix, react, typescript, bun, pdf2json",
"translate, translator, translation, text, image, document, pdf, ai, openai, gpt, chatgpt, vercel, nextjs, tailwindcss, shadcn, radix, react, typescript, bun, pdf-parse",
metadataBase: new URL("https://talk-translate.vercel.app"),
};

Expand Down
5 changes: 5 additions & 0 deletions src/lib/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,8 @@ export const enum SearchParams {
OPTION = "op",
TEXT = "text",
}

/**
 * Data-URI headers that mark a string as a base64-encoded PDF payload.
 *
 * Each entry is the full prefix up to and including the comma, so the
 * base64 body is whatever follows the matched prefix.
 */
export const validPrefixes: string[] = [
  // Standard MIME type for PDF documents.
  "data:application/pdf;base64,",
  // Non-standard variant. NOTE(review): presumably produced by some upload
  // client — confirm where this form originates.
  "data:@file/pdf;base64,",
];

0 comments on commit a35bd99

Please sign in to comment.