Skip to content

Commit

Permalink
Merge pull request #12 from xxxsen/xxxsen/feature/more_scrape_source
Browse files Browse the repository at this point in the history
Xxxsen/feature/more scrape source
  • Loading branch information
xxxsen authored Jan 10, 2025
2 parents 366645e + 71b429c commit 38aeb3c
Show file tree
Hide file tree
Showing 13 changed files with 291 additions and 11 deletions.
9 changes: 9 additions & 0 deletions client/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ import (
"compress/gzip"
"io"
"net/http"

"github.com/klauspost/compress/zstd"
"github.com/xxxsen/common/iotool"
)

func getResponseBody(rsp *http.Response) (io.ReadCloser, error) {
Expand All @@ -13,6 +16,12 @@ func getResponseBody(rsp *http.Response) (io.ReadCloser, error) {
return gzip.NewReader(rsp.Body)
case "deflate":
return flate.NewReader(rsp.Body), nil
case "zstd":
r, err := zstd.NewReader(rsp.Body)
if err != nil {
return nil, err
}
return iotool.WrapReadWriteCloser(r, nil, rsp.Body), nil
default:
return rsp.Body, nil
}
Expand Down
4 changes: 3 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,20 @@ func defaultConfig() *Config {
"caribpr",
"18av",
"njav",
"missav",
"freejavbt",
"tktube",
"avsox",
},
CategoryPlugins: []CategoryPlugin{
//如果存在分配配置, 那么当番号被识别为特定分类的场景下, 将会使用分类插件直接查询
{Name: "FC2", Plugins: []string{"fc2", "18av", "njav", "freejavbt", "tktube", "avsox"}},
{Name: "FC2", Plugins: []string{"fc2", "18av", "njav", "freejavbt", "tktube", "avsox", "fc2ppvdb"}},
},
Handlers: []string{
"image_transcoder",
"poster_cropper",
"watermark_maker",
"actor_spliter",
"tag_padder",
"duration_fixer",
"number_title",
Expand Down
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ require (
golang.org/x/text v0.21.0
)

require github.com/mitchellh/mapstructure v1.5.0 // indirect

require (
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/antchfx/xpath v1.3.0 // indirect
Expand All @@ -32,7 +34,7 @@ require (
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/kelseyhightower/envconfig v1.4.0
github.com/klauspost/compress v1.17.9 // indirect
github.com/klauspost/compress v1.17.9
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/onsi/ginkgo/v2 v2.20.2 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
github.com/onsi/ginkgo/v2 v2.20.2 h1:7NVCeyIWROIAheY21RLS+3j2bb52W0W82tkberYytp4=
github.com/onsi/ginkgo/v2 v2.20.2/go.mod h1:K9gyxPIlb+aIvnZ8bd9Ak+YP18w3APlR+5coaZoE2ag=
github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k=
Expand Down
13 changes: 12 additions & 1 deletion number/category.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,19 @@ func IsFc2(number string) bool {
}

// DetermineCategory returns the Category for numberId: CatFC2 when the number
// is recognised as an FC2 number, otherwise CatDefault.
func DetermineCategory(numberId string) Category {
if strings.HasPrefix(strings.ToUpper(numberId), "FC2") {
if IsFc2(numberId) {
return CatFC2
}
return CatDefault // no category by default
}

// DecodeFc2ValID extracts the trailing ID of an FC2 number, i.e. everything
// after the last "-" in n.
//
// It returns ("", false) when n is not an FC2 number (per IsFc2), when n
// contains no "-", or when nothing follows the last "-" (an empty ID is
// never reported as a successful decode).
func DecodeFc2ValID(n string) (string, bool) {
	if !IsFc2(n) {
		return "", false
	}
	idx := strings.LastIndex(n, "-")
	// idx == len(n)-1 means the number ends with "-", so the ID would be empty.
	if idx < 0 || idx == len(n)-1 {
		return "", false
	}
	return n[idx+1:], true
}
7 changes: 0 additions & 7 deletions processor/constant.go

This file was deleted.

58 changes: 58 additions & 0 deletions processor/handler/actor_split_handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package handler

import (
"context"
"regexp"
"strings"
"yamdc/model"
)

var (
	// defaultExtractActorRegexp matches a "name (alias)" pair, capturing the
	// text before the parentheses and the text inside them.
	defaultExtractActorRegexp = regexp.MustCompile(`\s*(.+?)\s*\(\s*(.+?)\s*\)`)
)

// actorSplitHandler splits actor entries of the form "name (alias)" into
// separate actor names.
type actorSplitHandler struct {
}

// cleanActor trims surrounding whitespace and normalises full-width
// parentheses (U+FF08 / U+FF09) to their ASCII counterparts so that
// defaultExtractActorRegexp, which only knows ASCII parentheses, can match.
func (h *actorSplitHandler) cleanActor(actor string) string {
	actor = strings.TrimSpace(actor)
	actor = strings.ReplaceAll(actor, "\uFF08", "(")
	actor = strings.ReplaceAll(actor, "\uFF09", ")")
	return actor
}

// tryExtractActor extracts every "name (alias)" pair found in actor.
//
// It returns the names in order of appearance (name first, alias second for
// each pair) and true, or (nil, false) when actor contains no such pair.
// Text that follows the last closing parenthesis is not returned.
func (h *actorSplitHandler) tryExtractActor(actor string) ([]string, bool) {
	// Collect every non-overlapping match.
	matches := defaultExtractActorRegexp.FindAllStringSubmatch(actor, -1)
	if len(matches) == 0 {
		return nil, false
	}
	// Each match yields exactly two names: match[0] is the whole match,
	// match[1] and match[2] are the two capture groups.
	rs := make([]string, 0, 2*len(matches))
	for _, match := range matches {
		rs = append(rs, strings.TrimSpace(match[1]), strings.TrimSpace(match[2]))
	}
	return rs, true
}

// Handle rewrites fc.Meta.Actors so that entries carrying an alias in
// parentheses, e.g. "永野司 (永野つかさ)", become separate actor entries.
// Entries without such a pair are kept (after cleanActor). It always
// returns nil.
func (h *actorSplitHandler) Handle(ctx context.Context, fc *model.FileContext) error {
	result := make([]string, 0, len(fc.Meta.Actors))
	for _, raw := range fc.Meta.Actors {
		name := h.cleanActor(raw)
		if parts, found := h.tryExtractActor(name); found {
			result = append(result, parts...)
			continue
		}
		// No "name (alias)" pair: keep the cleaned entry unchanged.
		result = append(result, name)
	}
	fc.Meta.Actors = result
	return nil
}

func init() {
Register(HActorSpliter, HandlerToCreator(&actorSplitHandler{}))
}
25 changes: 25 additions & 0 deletions processor/handler/actor_split_handler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package handler

import (
"context"
"testing"
"yamdc/model"

"github.com/stretchr/testify/assert"
)

// TestSplitActor checks that actor entries of the form "name (alias)" are
// split into separate names, in input order.
func TestSplitActor(t *testing.T) {
	tests := []string{
		"永野司 (永野つかさ)",
		"萨达(AA萨达)",
	}
	h := &actorSplitHandler{}
	in := &model.FileContext{
		Meta: &model.AvMeta{
			Actors: tests,
		},
	}
	err := h.Handle(context.Background(), in)
	assert.NoError(t, err)
	// Assert the actual split result instead of only logging it, so a
	// regression in the handler fails the test.
	assert.Equal(t, []string{"永野司", "永野つかさ", "萨达", "AA萨达"}, in.Meta.Actors)
	t.Logf("read actor list:%+v", in.Meta.Actors)
}
1 change: 1 addition & 0 deletions processor/handler/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ const (
HWatermakrMaker = "watermark_maker"
HTagPadder = "tag_padder"
HNumberTitle = "number_title"
HActorSpliter = "actor_spliter"
)
11 changes: 10 additions & 1 deletion searcher/parser/duration_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,18 @@ var (
defaultDurationRegexp = regexp.MustCompile(`\s*(\d+)\s*.+`)
)

// cleanTimeSequence splits res on ":" and returns the pieces with
// surrounding whitespace removed from each one.
func cleanTimeSequence(res string) []string {
	parts := strings.Split(res, ":")
	cleaned := make([]string, len(parts))
	for i := range parts {
		cleaned[i] = strings.TrimSpace(parts[i])
	}
	return cleaned
}

func DefaultHHMMSSDurationParser(ctx context.Context) decoder.NumberParseFunc {
return func(v string) int64 {
res := strings.Split(v, ":")
res := cleanTimeSequence(v)
if len(res) > 3 {
logutil.GetLogger(ctx).Error("invalid time format", zap.String("data", v))
return 0
Expand Down
2 changes: 2 additions & 0 deletions searcher/plugin/constant/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ const (
SS18AV = "18av"
SSTKTube = "tktube"
SSNJav = "njav"
SSFc2PPVDB = "fc2ppvdb"
SSMissav = "missav"
)
73 changes: 73 additions & 0 deletions searcher/plugin/impl/fc2ppvdb.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package impl

import (
"context"
"fmt"
"net/http"
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
"yamdc/searcher/plugin/api"
"yamdc/searcher/plugin/constant"
"yamdc/searcher/plugin/factory"
"yamdc/searcher/plugin/meta"
"yamdc/searcher/utils"
)

// defaultFc2PPVDBDomains lists the hosts used to build fc2ppvdb article URLs;
// one of them is picked with api.MustSelectDomain when a request is made.
var defaultFc2PPVDBDomains = []string{
"fc2ppvdb.com",
}

// fc2ppvdb is the search plugin for the fc2ppvdb site. It embeds
// api.DefaultPlugin (presumably for default implementations of the plugin
// hooks it does not define itself — confirm in the api package).
type fc2ppvdb struct {
api.DefaultPlugin
}

// OnMakeHTTPRequest builds the GET request for the fc2ppvdb article page of
// nid. The article ID is decoded from the number with number.DecodeFc2ValID;
// an error is returned when the number cannot be decoded or the request
// cannot be constructed. The request carries ctx so that cancellation and
// deadlines set by the caller apply to it.
func (p *fc2ppvdb) OnMakeHTTPRequest(ctx context.Context, nid *number.Number) (*http.Request, error) {
	vid, ok := number.DecodeFc2ValID(nid.GetNumberID())
	if !ok {
		return nil, fmt.Errorf("unable to decode fc2 vid")
	}
	link := fmt.Sprintf("https://%s/articles/%s", api.MustSelectDomain(defaultFc2PPVDBDomains), vid)
	return http.NewRequestWithContext(ctx, http.MethodGet, link, nil)
}

// OnDecodeHTTPData decodes an fc2ppvdb article page into an AvMeta using
// XPath expressions.
//
// It returns (nil, false, err) when decoding fails and (nil, false, nil)
// when the page yields no number. On success the decoded number is replaced
// by the number ID stored in ctx, the metadata is passed to
// utils.EnableDataTranslate, and (meta, true, nil) is returned.
func (p *fc2ppvdb) OnDecodeHTTPData(ctx context.Context, data []byte) (*model.AvMeta, bool, error) {
	xpathDecoder := decoder.XPathHtmlDecoder{
		NumberExpr:          `//div[contains(text(), "ID")]/span/text()`,
		TitleExpr:           `//div[@class="w-full lg:pl-8 px-2 lg:w-3/5"]/h2/a/text()`,
		PlotExpr:            "",
		ActorListExpr:       `//div[contains(text(), "女優")]/span/a/text()`,
		ReleaseDateExpr:     `//div[contains(text(), "販売日")]/span/text()`,
		DurationExpr:        `//div[contains(text(), "収録時間")]/span/text()`,
		StudioExpr:          `//div[contains(text(), "販売者")]/span/a/text()`,
		LabelExpr:           "",
		DirectorExpr:        `//div[contains(text(), "販売者")]/span/a/text()`,
		SeriesExpr:          "",
		GenreListExpr:       `//div[contains(text(), "タグ")]/span/a/text()`,
		CoverExpr:           `//div[@class="lg:w-2/5 w-full mb-12 md:mb-0"]/a/img/@src`,
		PosterExpr:          `//div[@class="lg:w-2/5 w-full mb-12 md:mb-0"]/a/img/@src`,
		SampleImageListExpr: "",
	}
	avMeta, decodeErr := xpathDecoder.DecodeHTML(
		data,
		decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx)),
		decoder.WithDurationParser(parser.DefaultHHMMSSDurationParser(ctx)),
	)
	switch {
	case decodeErr != nil:
		return nil, false, decodeErr
	case len(avMeta.Number) == 0:
		// The page did not yield a number: report "not found" without error.
		return nil, false, nil
	}
	avMeta.Number = meta.GetNumberId(ctx)
	utils.EnableDataTranslate(avMeta)
	return avMeta, true, nil
}

// init registers the fc2ppvdb plugin under constant.SSFc2PPVDB.
func init() {
	creator := factory.PluginToCreator(&fc2ppvdb{})
	factory.Register(constant.SSFc2PPVDB, creator)
}
93 changes: 93 additions & 0 deletions searcher/plugin/impl/missav.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package impl

import (
"context"
"fmt"
"net/http"
"strings"
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
"yamdc/searcher/plugin/api"
"yamdc/searcher/plugin/constant"
"yamdc/searcher/plugin/factory"
"yamdc/searcher/plugin/meta"
"yamdc/searcher/plugin/twostep"
)

var (
// defaultMissavDomains lists the hosts used to build missav search URLs;
// one of them is picked with api.MustSelectDomain when a request is made.
defaultMissavDomains = []string{
"missav.ws",
}
)

// missav is the search plugin for the missav site. It embeds
// api.DefaultPlugin (presumably for default implementations of the plugin
// hooks it does not define itself — confirm in the api package).
type missav struct {
api.DefaultPlugin
}

// OnMakeHTTPRequest builds the GET request for the missav search page of the
// given number. The request carries ctx so that cancellation and deadlines
// set by the caller apply to it.
func (p *missav) OnMakeHTTPRequest(ctx context.Context, number *number.Number) (*http.Request, error) {
	link := fmt.Sprintf("https://%s/cn/search/%s", api.MustSelectDomain(defaultMissavDomains), number.GetNumberID())
	return http.NewRequestWithContext(ctx, http.MethodGet, link, nil)
}

// OnHandleHTTPRequest runs the two-step missav lookup via
// twostep.HandleXPathTwoStepSearch: the search page is scanned for result
// links and their titles, and the first link whose title contains the
// number ID stored in ctx is selected as the detail page.
func (p *missav) OnHandleHTTPRequest(ctx context.Context, invoker api.HTTPInvoker, req *http.Request) (*http.Response, error) {
	xctx := &twostep.XPathTwoStepContext{
		Ps: []*twostep.XPathPair{
			{
				Name:  "read-link",
				XPath: `//div[@class="my-2 text-sm text-nord4 truncate"]/a[@class="text-secondary group-hover:text-primary"]/@href`,
			},
			{
				Name:  "read-title",
				XPath: `//div[@class="my-2 text-sm text-nord4 truncate"]/a[@class="text-secondary group-hover:text-primary"]/text()`,
			},
		},
		LinkSelector: func(ps []*twostep.XPathPair) (string, bool, error) {
			linkList := ps[0].Result
			titleList := ps[1].Result
			// The number ID does not change between iterations; look it up once.
			numberID := meta.GetNumberId(ctx)
			for i, link := range linkList {
				// Never index past the titles, even if the two result
				// lists turn out to have different lengths.
				if i >= len(titleList) {
					break
				}
				if strings.Contains(titleList[i], numberID) {
					return link, true, nil
				}
			}
			return "", false, nil
		},
		ValidStatusCode:       []int{http.StatusOK},
		CheckResultCountMatch: true,
		LinkPrefix:            "",
	}
	return twostep.HandleXPathTwoStepSearch(ctx, invoker, req, xctx)
}

// OnDecodeHTTPData decodes a missav detail page into an AvMeta using XPath
// expressions.
//
// It returns (nil, false, err) when decoding fails, (nil, false, nil) when
// the page yields no number, and (meta, true, nil) otherwise.
func (p *missav) OnDecodeHTTPData(ctx context.Context, data []byte) (*model.AvMeta, bool, error) {
	dec := decoder.XPathHtmlDecoder{
		NumberExpr:          `//div[span[contains(text(), "番号")]]/span[@class="font-medium"]/text()`,
		TitleExpr:           `//div[@class="mt-4"]/h1[@class="text-base lg:text-lg text-nord6"]/text()`,
		PlotExpr:            "",
		ActorListExpr:       `//div[span[contains(text(), "女优")]]/a/text()`,
		ReleaseDateExpr:     `//div[span[contains(text(), "发行日期")]]/time/text()`,
		DurationExpr:        "",
		StudioExpr:          `//div[span[contains(text(), "发行商")]]/a/text()`,
		LabelExpr:           "",
		DirectorExpr:        `//div[span[contains(text(), "导演")]]/a/text()`,
		SeriesExpr:          "",
		GenreListExpr:       `//div[span[contains(text(), "类型")]]/a/text()`,
		CoverExpr:           `//link[@rel="preload" and @as="image"]/@href`,
		PosterExpr:          "",
		SampleImageListExpr: "",
	}
	mdata, err := dec.DecodeHTML(data, decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx)))
	if err != nil {
		return nil, false, err
	}
	if len(mdata.Number) == 0 {
		// err is known to be nil here; return a literal nil so the
		// "not found, no error" outcome is explicit.
		return nil, false, nil
	}
	return mdata, true, nil
}

// init registers the missav plugin under constant.SSMissav.
func init() {
	creator := factory.PluginToCreator(&missav{})
	factory.Register(constant.SSMissav, creator)
}

0 comments on commit 38aeb3c

Please sign in to comment.