Skip to content

Commit

Permalink
Support PDF as input type
Browse files Browse the repository at this point in the history
If the input file is a PDF, the requested page is converted into a PNG
with fitz and then passed on to tesseract for text extraction.
  • Loading branch information
jessp01 committed Jan 7, 2025
1 parent de55c99 commit e46ad54
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 1 deletion.
45 changes: 45 additions & 0 deletions cmd/super-zaje/super-zaje.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"bufio"
"errors"
"fmt"
"io"
"log"
"net/http"
Expand All @@ -11,13 +12,18 @@ import (
"regexp"
"strings"

"github.com/gen2brain/go-fitz"
"github.com/jessp01/zaje"
"github.com/otiai10/gosseract/v2"
"github.com/urfave/cli"
)

func main() {

var isPdf bool

var pdfPage int

app := cli.NewApp()
zaje.PopulateAppMetadata(app)

Expand All @@ -28,6 +34,20 @@ func main() {
},
)

app.Flags = append(app.Flags, cli.BoolFlag{
Name: "pdf",
Usage: "Pass if input is a PDF file.\n",
Destination: &isPdf,
},
)

app.Flags = append(app.Flags, cli.IntFlag{
Name: "pdf-page-number, pn",
Usage: "When working on a PDF, set the page to process (first page is 0, not 1).\n",
Destination: &pdfPage,
},
)

app.Action = func(c *cli.Context) error {
log.SetFlags(log.LstdFlags | log.Lshortfile)
fi, err := os.Stdin.Stat()
Expand All @@ -42,6 +62,31 @@ func main() {
return errors.New("no input file provided. " + app.Name + " needs a file or data from STDIN")
}
filename = c.Args().Get(0)
if isPdf {
imgFileName := filepath.Join(os.TempDir(), fmt.Sprintf("%s_p%d.png", filepath.Base(filename), pdfPage))
imageFilePtr, err := os.Create(imgFileName)
if err != nil {
log.Fatal(err)
}
doc, err := fitz.New(filename)
if err != nil {
log.Fatal(err)
}

img, err := doc.ImagePNG(pdfPage, 300.00)
if err != nil {
log.Fatal(err)
}

err = os.WriteFile(imgFileName, img, 0644)
if err != nil {
log.Fatal(err)
}

imageFilePtr.Close()
doc.Close()
filename = imgFileName
}
data, err := zaje.ReadDataFromFile(filename)
if err != nil {
log.Fatal(err)
Expand Down
7 changes: 6 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
module github.com/jessp01/zaje

go 1.19
go 1.22

toolchain go1.22.2

require (
github.com/fatih/color v1.15.0
github.com/gen2brain/go-fitz v1.24.14
github.com/jessp01/gohighlight v0.21.1-17
github.com/otiai10/gosseract/v2 v2.4.0
github.com/urfave/cli v1.22.14
)

require (
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/ebitengine/purego v0.8.0 // indirect
github.com/jupiterrider/ffi v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.17 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@ github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE=
github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs=
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
github.com/gen2brain/go-fitz v1.24.14 h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo=
github.com/gen2brain/go-fitz v1.24.14/go.mod h1:0KaZeQgASc20Yp5R/pFzyy7SmP01XcoHKNF842U2/S4=
github.com/jessp01/gohighlight v0.21.1-17 h1:/1tJS01Q9ss/ueOeZbVjSsDKdFPXjuOMbQYna0tsp5g=
github.com/jessp01/gohighlight v0.21.1-17/go.mod h1:52r0Yxd1+T9f7uLenaO2/34K3gPOejxCxXwdNc/2Z8Y=
github.com/jupiterrider/ffi v0.2.0 h1:tMM70PexgYNmV+WyaYhJgCvQAvtTCs3wXeILPutihnA=
github.com/jupiterrider/ffi v0.2.0/go.mod h1:yqYqX5DdEccAsHeMn+6owkoI2llBLySVAF8dwCDZPVs=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
Expand Down

0 comments on commit e46ad54

Please sign in to comment.