[boardgames] introducing a background mechanism for scraping websites using queues
1 parent a80508e · commit daa3125
Showing 13 changed files with 968 additions and 1 deletion.
@@ -0,0 +1,99 @@
package main

import (
	"encoding/json"
	"log"
	"time"

	"github.com/DictumMortuum/servus-extapi/pkg/db"
	"github.com/DictumMortuum/servus-extapi/pkg/scrape"
	"github.com/adjust/rmq/v5"
	"github.com/jmoiron/sqlx"
)

// setUrlToScraped writes the results of a finished scrape back to the URL row.
func setUrlToScraped(DB *sqlx.DB, payload map[string]any) error {
	_, err := DB.NamedExec(`
		update
			tscrapeurl
		set
			last_scraped = :scraped,
			last_instock = :instock,
			last_preorder = :preorder,
			last_outofstock = :outofstock,
			last_pages = :pages_visited
		where
			id = :id
	`, payload)
	if err != nil {
		return err
	}

	return nil
}

// consumeFn processes a single scrape task: mark the store's prices stale,
// scrape the URL, clean up leftover rows and record the results.
func consumeFn(task scrape.GenericScrapeRequest) error {
	DB, err := db.DatabaseX()
	if err != nil {
		return err
	}
	defer DB.Close()

	sc, err := getScrape(DB, task.ScrapeUrl.ScrapeId)
	if err != nil {
		return err
	}

	err = scrape.Stale(DB, sc.StoreId)
	if err != nil {
		return err
	}

	payload, _, err := scrape.GenericScrape(*sc, DB, task)
	if err != nil {
		return err
	}

	err = scrape.Cleanup(DB, sc.StoreId)
	if err != nil {
		return err
	}

	log.Println(task.ListOnly, payload)
	err = setUrlToScraped(DB, payload)
	if err != nil {
		return err
	}

	return nil
}

// Consumer opens the "scrape" queue and handles deliveries as they arrive.
func Consumer(conn rmq.Connection) {
	queue, err := conn.OpenQueue("scrape")
	if err != nil {
		log.Fatal(err)
	}

	err = queue.StartConsuming(10, time.Second)
	if err != nil {
		log.Fatal(err)
	}

	_, err = queue.AddConsumerFunc("scraper", func(d rmq.Delivery) {
		var task scrape.GenericScrapeRequest
		err := json.Unmarshal([]byte(d.Payload()), &task)
		if err != nil {
			d.Reject()
			return // don't fall through and ack a rejected delivery
		}

		err = consumeFn(task)
		if err != nil {
			log.Println(err)
			d.Reject()
			return // don't fall through and ack a rejected delivery
		}

		d.Ack()
	})
	if err != nil {
		log.Fatal(err)
	}
}
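Not part of the diff: a minimal sketch of how Consumer might be wired into a main function, assuming a Redis instance on localhost and the rmq v5 OpenConnection API. The connection tag, Redis address, database number and shutdown handling are illustrative only.

package main

import (
	"log"

	"github.com/adjust/rmq/v5"
)

func main() {
	// rmq reports background errors (heartbeats, consumers) on a channel.
	errChan := make(chan error, 10)
	go func() {
		for err := range errChan {
			log.Println("rmq:", err)
		}
	}()

	// Hypothetical Redis address and database number.
	conn, err := rmq.OpenConnection("scraper", "tcp", "localhost:6379", 1, errChan)
	if err != nil {
		log.Fatal(err)
	}

	// Start consuming from the "scrape" queue.
	Consumer(conn)

	// Block forever; real code would trap signals and stop consuming gracefully.
	select {}
}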
@@ -0,0 +1,222 @@
package main

import (
	"encoding/json"

	"github.com/DictumMortuum/servus-extapi/pkg/model"
	"github.com/DictumMortuum/servus-extapi/pkg/queries"
	"github.com/DictumMortuum/servus-extapi/pkg/scrape"
	"github.com/jmoiron/sqlx"
)

// getScrape fetches a scraper configuration by id.
func getScrape(DB *sqlx.DB, id int64) (*model.Scrape, error) {
	sc := model.Scrape{}
	err := DB.Get(&sc, `
		select
			*
		from
			tscrape
		where
			id = ?
	`, id)
	if err != nil {
		return nil, err
	}

	return &sc, nil
}

// getScrapeUrl fetches a single scrape URL by id.
func getScrapeUrl(DB *sqlx.DB, id int64) (*model.ScrapeUrl, error) {
	u := model.ScrapeUrl{}
	err := DB.Get(&u, `
		select
			*
		from
			tscrapeurl
		where
			id = ?
	`, id)
	if err != nil {
		return nil, err
	}

	return &u, nil
}

// getScrapeUrls fetches all URLs that belong to a scraper.
func getScrapeUrls(DB *sqlx.DB, id int64) ([]model.ScrapeUrl, error) {
	u := []model.ScrapeUrl{}
	err := DB.Select(&u, `
		select
			*
		from
			tscrapeurl
		where
			scrape_id = ?
	`, id)
	if err != nil {
		return nil, err
	}

	return u, nil
}

// setUrlToPending clears the bookkeeping columns of a URL, marking it as
// queued but not yet scraped; the consumer fills them in again on completion.
func setUrlToPending(DB *sqlx.DB, id int64) error {
	_, err := DB.Exec(`
		update
			tscrapeurl
		set
			last_scraped = NULL,
			last_instock = NULL,
			last_preorder = NULL,
			last_outofstock = NULL,
			last_pages = NULL
		where
			id = ?
	`, id)
	if err != nil {
		return err
	}

	return nil
}

// scrapeSingle publishes one scrape request to the "scrape" queue and marks
// the corresponding URL as pending.
func scrapeSingle(req *model.Map, r *scrape.GenericScrapeRequest) error {
	conn, err := req.GetRmq()
	if err != nil {
		return err
	}

	DB, err := req.GetDB()
	if err != nil {
		return err
	}

	// Named "queue" rather than "scrape" to avoid shadowing the scrape package.
	queue, err := conn.OpenQueue("scrape")
	if err != nil {
		return err
	}

	raw, err := json.Marshal(r)
	if err != nil {
		return err
	}

	err = queue.Publish(string(raw))
	if err != nil {
		return err
	}

	err = setUrlToPending(DB, r.ScrapeUrl.Id)
	if err != nil {
		return err
	}

	return nil
}

type scrapeBody struct {
	ListOnly bool `json:"list_only"`
}

// ScrapeUrl queues a scrape for a single URL.
func ScrapeUrl(req *model.Map, res *model.Map) error {
	id, err := req.GetInt64("id")
	if err != nil {
		return err
	}

	DB, err := req.GetDB()
	if err != nil {
		return err
	}

	body, err := req.GetByte("body")
	if err != nil {
		return err
	}

	var payload scrapeBody
	err = json.Unmarshal(body, &payload)
	if err != nil {
		return err
	}

	u, err := getScrapeUrl(DB, id)
	if err != nil {
		return err
	}

	cfg, err := queries.GetConfig(DB, "SCRAPE_CACHE")
	if err != nil {
		return err
	}

	r := scrape.GenericScrapeRequest{
		ScrapeUrl: *u,
		Cache:     cfg.Value,
		ListOnly:  payload.ListOnly,
	}

	err = scrapeSingle(req, &r)
	if err != nil {
		return err
	}

	res.SetInternal(map[string]any{
		"req": r,
	})

	return nil
}

// ScrapeStore queues scrapes for every URL that belongs to a scraper.
func ScrapeStore(req *model.Map, res *model.Map) error {
	id, err := req.GetInt64("id")
	if err != nil {
		return err
	}

	DB, err := req.GetDB()
	if err != nil {
		return err
	}

	body, err := req.GetByte("body")
	if err != nil {
		return err
	}

	var payload scrapeBody
	err = json.Unmarshal(body, &payload)
	if err != nil {
		return err
	}

	sc, err := getScrape(DB, id)
	if err != nil {
		return err
	}

	u, err := getScrapeUrls(DB, sc.Id)
	if err != nil {
		return err
	}

	cfg, err := queries.GetConfig(DB, "SCRAPE_CACHE")
	if err != nil {
		return err
	}

	for _, url := range u {
		r := scrape.GenericScrapeRequest{
			ScrapeUrl: url,
			Cache:     cfg.Value,
			ListOnly:  payload.ListOnly,
		}

		err = scrapeSingle(req, &r)
		if err != nil {
			return err
		}
	}

	return nil
}
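Not part of the commit: a minimal sketch of enqueuing a task by hand, e.g. from a one-off script, to exercise the consumer without going through the HTTP handlers. The Redis address and row ids are hypothetical, Cache is left at its zero value, and note that publishing directly skips the setUrlToPending bookkeeping that scrapeSingle performs.

package main

import (
	"encoding/json"
	"log"

	"github.com/DictumMortuum/servus-extapi/pkg/model"
	"github.com/DictumMortuum/servus-extapi/pkg/scrape"
	"github.com/adjust/rmq/v5"
)

func main() {
	errChan := make(chan error, 1)

	// Hypothetical Redis address, as in the consumer sketch above.
	conn, err := rmq.OpenConnection("enqueue", "tcp", "localhost:6379", 1, errChan)
	if err != nil {
		log.Fatal(err)
	}

	queue, err := conn.OpenQueue("scrape")
	if err != nil {
		log.Fatal(err)
	}

	// Hypothetical row ids; the handlers above resolve these from the request.
	task := scrape.GenericScrapeRequest{
		ScrapeUrl: model.ScrapeUrl{Id: 42, ScrapeId: 7},
		ListOnly:  true,
	}

	raw, err := json.Marshal(task)
	if err != nil {
		log.Fatal(err)
	}

	if err := queue.Publish(string(raw)); err != nil {
		log.Fatal(err)
	}
}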