Skip to content
This repository was archived by the owner on Apr 30, 2025. It is now read-only.

Commit 8af8501

Browse files
committed
implement pdf clean up
1 parent 08420a3 commit 8af8501

12 files changed

Lines changed: 354 additions & 9 deletions

File tree

.github/workflows/go.yml

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,24 @@ jobs:
1717
with:
1818
go-version: '1.22.1'
1919

20-
- name: Build
21-
run: go build -v ./...
22-
23-
- name: Clean
24-
run: go clean -cache -testcache -modcache
25-
20+
- name: Install pkg-config and Dependencies
21+
run: |
22+
sudo apt-get update
23+
sudo apt-get install -y pkg-config
24+
25+
- name: Set up PDFium library (Linux)
26+
run: |
27+
# Download the prebuilt PDFium binary for Linux (x64)
28+
sudo curl -L "https://github.com/bblanchon/pdfium-binaries/releases/download/chromium%2F${PDFIUM_VERSION}/pdfium-linux-x64.tgz" -o pdfium-linux-x64.tgz
29+
sudo mkdir -p /opt/pdfium
30+
sudo tar -C /opt/pdfium -xvf pdfium-linux-x64.tgz
31+
sudo rm pdfium-linux-x64.tgz
32+
# Copy the provided pkg-config file (ensure this file exists in your repo)
33+
sudo cp ./.github/workflows/pdfium.pc /opt/pdfium/pdfium.pc
34+
2635
- name: Test
36+
env:
37+
RUNNING_TESTS: "true"
38+
LD_LIBRARY_PATH: "/opt/pdfium/lib"
39+
PKG_CONFIG_PATH: "/opt/pdfium"
2740
run: go test -v ./...

cmd/dcup/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/Dcup-dev/Dcup-lib/internal/html"
99
"github.com/Dcup-dev/Dcup-lib/internal/json"
1010
"github.com/Dcup-dev/Dcup-lib/internal/md"
11+
"github.com/Dcup-dev/Dcup-lib/internal/pdf"
1112
"github.com/Dcup-dev/Dcup-lib/internal/pptx"
1213
"github.com/Dcup-dev/Dcup-lib/internal/xlsx"
1314
)
@@ -20,6 +21,7 @@ type Dcup struct {
2021
Markdown *md.Client
2122
PowerPoint *pptx.Client
2223
Sheet *xlsx.Client
24+
Pdf *pdf.Client
2325
}
2426

2527
// Config is the user-facing configuration struct.
@@ -76,5 +78,6 @@ func Init(config Config) (*Dcup, error) {
7678
Markdown: md.NewClient(config),
7779
PowerPoint: pptx.NewClient(config),
7880
Sheet: xlsx.NewClient(config),
81+
Pdf: pdf.NewClient(config),
7982
}, nil
8083
}

examples/main.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,12 @@ func main() {
3030
}
3131

3232
schema := map[string]interface{}{
33-
"title": "string",
34-
"auther": "string",
33+
"user_name": "string",
34+
"email": "string",
35+
"address": "string",
3536
}
3637

37-
res, err := client.Html.CleanUrl("https://www.kelche.co/blog/go/golang-environment-variables/", schema)
38+
res, err := client.Pdf.CleanUlr("https://www.wmaccess.com/downloads/sample-invoice.pdf", schema)
3839
if err != nil {
3940
fmt.Println("Error:", err)
4041
return

go.mod

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,24 @@ module github.com/Dcup-dev/Dcup-lib
33
go 1.22.1
44

55
require (
6+
github.com/fatih/color v1.13.0 // indirect
7+
github.com/golang/protobuf v1.5.3 // indirect
8+
github.com/google/uuid v1.6.0 // indirect
9+
github.com/hashicorp/go-hclog v1.6.3 // indirect
10+
github.com/hashicorp/go-plugin v1.6.1 // indirect
11+
github.com/hashicorp/yamux v0.1.1 // indirect
612
github.com/joho/godotenv v1.5.1 // indirect
13+
github.com/jolestar/go-commons-pool/v2 v2.1.2 // indirect
14+
github.com/klippa-app/go-pdfium v1.13.0 // indirect
15+
github.com/mattn/go-colorable v0.1.12 // indirect
16+
github.com/mattn/go-isatty v0.0.14 // indirect
17+
github.com/mitchellh/go-testing-interface v0.0.0-20171004221916-a61a99592b77 // indirect
18+
github.com/oklog/run v1.0.0 // indirect
719
github.com/thedatashed/xlsxreader v1.2.8 // indirect
820
golang.org/x/net v0.35.0 // indirect
21+
golang.org/x/sys v0.30.0 // indirect
22+
golang.org/x/text v0.22.0 // indirect
23+
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect
24+
google.golang.org/grpc v1.58.3 // indirect
25+
google.golang.org/protobuf v1.34.1 // indirect
926
)

go.sum

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,63 @@
11
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
2+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3+
github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w=
4+
github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=
5+
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
6+
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
7+
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
8+
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
9+
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
10+
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
11+
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
12+
github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k=
13+
github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
14+
github.com/hashicorp/go-plugin v1.6.1 h1:P7MR2UP6gNKGPp+y7EZw2kOiq4IR9WiqLvp0XOsVdwI=
15+
github.com/hashicorp/go-plugin v1.6.1/go.mod h1:XPHFku2tFo3o3QKFgSYo+cghcUhw1NA1hZyMK0PWAw0=
16+
github.com/hashicorp/yamux v0.1.1 h1:yrQxtgseBDrq9Y652vSRDvsKCJKOUD+GzTS4Y0Y8pvE=
17+
github.com/hashicorp/yamux v0.1.1/go.mod h1:CtWFDAQgb7dxtzFs4tWbplKIe2jSi3+5vKbgIO0SLnQ=
218
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
319
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
20+
github.com/jolestar/go-commons-pool/v2 v2.1.2 h1:E+XGo58F23t7HtZiC/W6jzO2Ux2IccSH/yx4nD+J1CM=
21+
github.com/jolestar/go-commons-pool/v2 v2.1.2/go.mod h1:r4NYccrkS5UqP1YQI1COyTZ9UjPJAAGTUxzcsK1kqhY=
22+
github.com/klippa-app/go-pdfium v1.13.0 h1:Ow9+cPhhcmKdxp3GCwds79zWyOZHyJIJGGt70R/nUWc=
23+
github.com/klippa-app/go-pdfium v1.13.0/go.mod h1:eVVeeXJkk+W6KLaJy4fizsERYUaMqJulSXD9JQjx+8s=
24+
github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
25+
github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZbaA40=
26+
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
27+
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
28+
github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y=
29+
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
30+
github.com/mitchellh/go-testing-interface v0.0.0-20171004221916-a61a99592b77 h1:7GoSOOW2jpsfkntVKaS2rAr1TJqfcxotyaUcuxoZSzg=
31+
github.com/mitchellh/go-testing-interface v0.0.0-20171004221916-a61a99592b77/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI=
32+
github.com/oklog/run v1.0.0 h1:Ru7dDtJNOyC66gQ5dQmaCa0qIsAUFY3sFpK1Xk8igrw=
33+
github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA=
434
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
535
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
636
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
37+
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
38+
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
739
github.com/thedatashed/xlsxreader v1.2.8 h1:8aGbkXIPEThQbA8KzUZqIa4v4oqFrJFKLQ36vWePI5U=
840
github.com/thedatashed/xlsxreader v1.2.8/go.mod h1:wZyb/2xF1+rkZ2ujhC72tuuOWBY574QvcXHFls+5AXc=
941
golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
1042
golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
43+
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
44+
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
45+
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
46+
golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
47+
golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
48+
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
49+
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
50+
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
51+
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
52+
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
53+
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 h1:bVf09lpb+OJbByTj913DRJioFFAjf/ZGxEz7MajTp2U=
54+
google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM=
55+
google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ=
56+
google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0=
57+
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
58+
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
59+
google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
60+
google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
61+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
62+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
63+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

internal/pdf/clean.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
package pdf
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
"sync"
7+
8+
"github.com/Dcup-dev/Dcup-lib/pdfium/renderer"
9+
"github.com/klippa-app/go-pdfium/requests"
10+
)
11+
12+
func clean(pdfBytes []byte) (string, error) {
13+
var instance = renderer.Instance
14+
// Open the PDF using PDFium
15+
doc, err := instance.OpenDocument(&requests.OpenDocument{
16+
File: &pdfBytes,
17+
})
18+
if err != nil {
19+
return "", fmt.Errorf("failed to open PDF document: %v", err)
20+
}
21+
defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
22+
Document: doc.Document,
23+
})
24+
25+
// Get the page count
26+
pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{
27+
Document: doc.Document,
28+
})
29+
if err != nil {
30+
return "", fmt.Errorf("failed to get page count: %v", err)
31+
}
32+
33+
// Process pages concurrently
34+
var allText strings.Builder
35+
type PageContent struct {
36+
content string
37+
index int
38+
err error
39+
}
40+
41+
pagesChan := make(chan PageContent, pageCount.PageCount)
42+
43+
var pageWg sync.WaitGroup
44+
for pageIndex := 0; pageIndex < pageCount.PageCount; pageIndex++ {
45+
pageWg.Add(1)
46+
go func(pageIndex int) {
47+
defer pageWg.Done()
48+
text, err := processPage(instance, doc.Document, pageIndex)
49+
pagesChan <- PageContent{
50+
content: text,
51+
index: pageIndex,
52+
err: err,
53+
}
54+
}(pageIndex)
55+
}
56+
57+
// Wait for all goroutines to complete
58+
go func() {
59+
pageWg.Wait()
60+
close(pagesChan)
61+
}()
62+
63+
// Collect results and errors
64+
pages := make([]PageContent, pageCount.PageCount)
65+
for page := range pagesChan {
66+
if page.err != nil {
67+
return "", page.err
68+
}
69+
pages[page.index] = page
70+
}
71+
72+
for _, page := range pages {
73+
allText.WriteString(page.content + "\n")
74+
}
75+
76+
return allText.String(), nil
77+
}

internal/pdf/client.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package pdf
2+
3+
import "github.com/Dcup-dev/Dcup-lib/internal/core"
4+
5+
type Client struct {
6+
config core.ConfigProvider
7+
}
8+
9+
func NewClient(config core.ConfigProvider) *Client {
10+
return &Client{config: config}
11+
}

internal/pdf/file_cleaner.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package pdf
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"mime/multipart"
7+
8+
"github.com/Dcup-dev/Dcup-lib/internal/core"
9+
)
10+
11+
func (c Client) CleanFile(file multipart.FileHeader, schema map[string]interface{}) (map[string]interface{}, error) {
12+
13+
f, err := file.Open()
14+
if err != nil {
15+
return nil, fmt.Errorf("failed to open file: %s", file.Filename)
16+
}
17+
defer f.Close()
18+
19+
var buf bytes.Buffer
20+
if _, err := buf.ReadFrom(f); err != nil {
21+
return nil, fmt.Errorf("failed to read uploaded file: %v", err)
22+
}
23+
24+
pdfBytes := buf.Bytes()
25+
pdfText, err := clean(pdfBytes)
26+
27+
if len(pdfText) == 0 {
28+
return nil, fmt.Errorf("no text found in the file: %s", file.Filename)
29+
}
30+
return core.DataProcessing(c.config, core.CleanTextWithPreservation(pdfText), schema)
31+
}

internal/pdf/page_processer.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package pdf
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/Dcup-dev/Dcup-lib/internal/core"
7+
"github.com/klippa-app/go-pdfium"
8+
"github.com/klippa-app/go-pdfium/references"
9+
"github.com/klippa-app/go-pdfium/requests"
10+
)
11+
12+
func processPage(instance pdfium.Pdfium, doc references.FPDF_DOCUMENT, pageIndex int) (string, error) {
13+
// Load the page
14+
page, err := instance.FPDF_LoadPage(&requests.FPDF_LoadPage{
15+
Document: doc,
16+
Index: pageIndex,
17+
})
18+
if err != nil {
19+
return "", fmt.Errorf("failed to load page: %v", err)
20+
}
21+
defer instance.FPDF_ClosePage(&requests.FPDF_ClosePage{
22+
Page: page.Page,
23+
})
24+
25+
// Load text from the page
26+
txt, err := instance.FPDFText_LoadPage(&requests.FPDFText_LoadPage{
27+
Page: requests.Page{
28+
ByReference: &page.Page,
29+
},
30+
})
31+
if err != nil {
32+
return "", fmt.Errorf("failed to load text page: %v", err)
33+
}
34+
defer instance.FPDFText_ClosePage(&requests.FPDFText_ClosePage{
35+
TextPage: txt.TextPage,
36+
})
37+
38+
// Count characters
39+
countChar, err := instance.FPDFText_CountChars(&requests.FPDFText_CountChars{
40+
TextPage: txt.TextPage,
41+
})
42+
if err != nil {
43+
return "", fmt.Errorf("failed to count characters: %v", err)
44+
}
45+
46+
// Extract text
47+
text, err := instance.FPDFText_GetText(&requests.FPDFText_GetText{
48+
TextPage: txt.TextPage,
49+
Count: countChar.Count,
50+
})
51+
if err != nil {
52+
return "", fmt.Errorf("failed to extract text: %v", err)
53+
}
54+
55+
return core.CleanTextWithPreservation(text.Text), nil
56+
}

internal/pdf/url_cleaner.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package pdf
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"net/http"
7+
"time"
8+
9+
"github.com/Dcup-dev/Dcup-lib/internal/core"
10+
)
11+
12+
func (c Client) CleanUlr(url string, schema map[string]interface{}) (map[string]interface{}, error) {
13+
client := http.Client{
14+
Timeout: 10 * time.Second,
15+
}
16+
resp, err := client.Get(url)
17+
if err != nil {
18+
return nil, fmt.Errorf("failed to fetch pdf from URL: %v", err)
19+
}
20+
defer resp.Body.Close()
21+
22+
// Check if the URL returned a valid response
23+
if resp.StatusCode != http.StatusOK {
24+
return nil, fmt.Errorf("failed to fetch pdf: URL returned status code %d", resp.StatusCode)
25+
}
26+
27+
// Read the content into a buffer
28+
var buf bytes.Buffer
29+
if _, err := buf.ReadFrom(resp.Body); err != nil {
30+
return nil, fmt.Errorf("failed to read pdf content from URL: %v", err)
31+
}
32+
33+
// Parse the pdf data
34+
pdfBytes := buf.Bytes()
35+
pdfText, err := clean(pdfBytes)
36+
37+
if len(pdfText) == 0 {
38+
return nil, fmt.Errorf("no text found in the pdf")
39+
}
40+
return core.DataProcessing(c.config,core.CleanTextWithPreservation(pdfText), schema)
41+
}

0 commit comments

Comments
 (0)