diff --git a/client/io.go b/client/io.go index 0a3b235..c9ebce4 100644 --- a/client/io.go +++ b/client/io.go @@ -5,6 +5,9 @@ import ( "compress/gzip" "io" "net/http" + + "github.com/klauspost/compress/zstd" + "github.com/xxxsen/common/iotool" ) func getResponseBody(rsp *http.Response) (io.ReadCloser, error) { @@ -13,6 +16,12 @@ func getResponseBody(rsp *http.Response) (io.ReadCloser, error) { return gzip.NewReader(rsp.Body) case "deflate": return flate.NewReader(rsp.Body), nil + case "zstd": + r, err := zstd.NewReader(rsp.Body) + if err != nil { + return nil, err + } + return iotool.WrapReadWriteCloser(r, nil, rsp.Body), nil default: return rsp.Body, nil } diff --git a/config/config.go b/config/config.go index d9b1056..2e2671b 100644 --- a/config/config.go +++ b/config/config.go @@ -48,18 +48,20 @@ func defaultConfig() *Config { "caribpr", "18av", "njav", + "missav", "freejavbt", "tktube", "avsox", }, CategoryPlugins: []CategoryPlugin{ //如果存在分配配置, 那么当番号被识别为特定分类的场景下, 将会使用分类插件直接查询 - {Name: "FC2", Plugins: []string{"fc2", "18av", "njav", "freejavbt", "tktube", "avsox"}}, + {Name: "FC2", Plugins: []string{"fc2", "18av", "njav", "freejavbt", "tktube", "avsox", "fc2ppvdb"}}, }, Handlers: []string{ "image_transcoder", "poster_cropper", "watermark_maker", + "actor_spliter", "tag_padder", "duration_fixer", "number_title", diff --git a/go.mod b/go.mod index c0561f6..d41e9a5 100644 --- a/go.mod +++ b/go.mod @@ -20,6 +20,8 @@ require ( golang.org/x/text v0.21.0 ) +require github.com/mitchellh/mapstructure v1.5.0 // indirect + require ( github.com/andybalholm/brotli v1.1.0 // indirect github.com/antchfx/xpath v1.3.0 // indirect @@ -32,7 +34,7 @@ require ( github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/kelseyhightower/envconfig v1.4.0 - github.com/klauspost/compress v1.17.9 // indirect + github.com/klauspost/compress v1.17.9 github.com/mattn/go-isatty v0.0.20 // indirect github.com/onsi/ginkgo/v2 v2.20.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/go.sum b/go.sum index f68cbc1..d8668ca 100644 --- a/go.sum +++ b/go.sum @@ -49,6 +49,8 @@ github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2 github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/onsi/ginkgo/v2 v2.20.2 h1:7NVCeyIWROIAheY21RLS+3j2bb52W0W82tkberYytp4= github.com/onsi/ginkgo/v2 v2.20.2/go.mod h1:K9gyxPIlb+aIvnZ8bd9Ak+YP18w3APlR+5coaZoE2ag= github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= diff --git a/number/category.go b/number/category.go index 34a4729..d073020 100644 --- a/number/category.go +++ b/number/category.go @@ -19,8 +19,19 @@ func IsFc2(number string) bool { } func DetermineCategory(numberId string) Category { - if strings.HasPrefix(strings.ToUpper(numberId), "FC2") { + if IsFc2(numberId) { return CatFC2 } return CatDefault //默认无分类 } + +func DecodeFc2ValID(n string) (string, bool) { + if !IsFc2(n) { + return "", false + } + idx := strings.LastIndex(n, "-") + if idx < 0 { + return "", false + } + return n[idx+1:], true +} diff --git a/processor/constant.go b/processor/constant.go deleted file mode 100644 index cc1c5e6..0000000 --- a/processor/constant.go +++ /dev/null @@ -1,7 +0,0 @@ -package processor - -const ( - PsPosterCropper = "poster_cropper" - PsImageTranscoder = "image_transcoder" - PsPlotTranslater = "plot_translater" -) diff --git a/processor/handler/actor_split_handler.go b/processor/handler/actor_split_handler.go new file mode 100644 index 0000000..e2e56f3 --- /dev/null +++ b/processor/handler/actor_split_handler.go @@ -0,0 +1,58 @@ +package handler + +import ( + "context" + "regexp" + "strings" + "yamdc/model" +) + +var ( + defaultExtractActorRegexp = regexp.MustCompile(`\s*(.+?)\s*\(\s*(.+?)\s*\)`) +) + +type actorSplitHandler struct { +} + +func (h *actorSplitHandler) cleanActor(actor string) string { + actor = strings.TrimSpace(actor) + actor = strings.ReplaceAll(actor, "(", "(") + actor = strings.ReplaceAll(actor, ")", ")") + return actor +} + +func (h *actorSplitHandler) tryExtractActor(actor string) ([]string, bool) { + // 查找所有匹配的内容 + matches := defaultExtractActorRegexp.FindAllStringSubmatch(actor, -1) + + if len(matches) == 0 { + return nil, false + } + rs := make([]string, 0, 2) + for _, match := range matches { + if len(match) == 3 { // match[0] 是整个匹配的字符串,match[1] 和 match[2] 是捕获组 + rs = append(rs, strings.TrimSpace(match[1]), strings.TrimSpace(match[2])) + } + } + return rs, true +} + +func (h *actorSplitHandler) Handle(ctx context.Context, fc *model.FileContext) error { + //如果女优有括号, 尝试将其从括号中提取出来, example: 永野司 (永野つかさ) + actorlist := make([]string, 0, len(fc.Meta.Actors)) + for _, actor := range fc.Meta.Actors { + actor = h.cleanActor(actor) + splited, ok := h.tryExtractActor(actor) + if !ok { + actorlist = append(actorlist, actor) + continue + } + actorlist = append(actorlist, splited...) + } + fc.Meta.Actors = actorlist + return nil +} + +func init() { + Register(HActorSpliter, HandlerToCreator(&actorSplitHandler{})) +} diff --git a/processor/handler/actor_split_handler_test.go b/processor/handler/actor_split_handler_test.go new file mode 100644 index 0000000..267b1f4 --- /dev/null +++ b/processor/handler/actor_split_handler_test.go @@ -0,0 +1,25 @@ +package handler + +import ( + "context" + "testing" + "yamdc/model" + + "github.com/stretchr/testify/assert" +) + +func TestSplitActor(t *testing.T) { + tests := []string{ + "永野司 (永野つかさ)", + "萨达(AA萨达)", + } + h := &actorSplitHandler{} + in := &model.FileContext{ + Meta: &model.AvMeta{ + Actors: tests, + }, + } + err := h.Handle(context.Background(), in) + assert.NoError(t, err) + t.Logf("read actor list:%+v", in.Meta.Actors) +} diff --git a/processor/handler/constant.go b/processor/handler/constant.go index ed31ff8..8fbfd02 100644 --- a/processor/handler/constant.go +++ b/processor/handler/constant.go @@ -8,4 +8,5 @@ const ( HWatermakrMaker = "watermark_maker" HTagPadder = "tag_padder" HNumberTitle = "number_title" + HActorSpliter = "actor_spliter" ) diff --git a/searcher/parser/duration_parser.go b/searcher/parser/duration_parser.go index eae847b..19b0e8d 100644 --- a/searcher/parser/duration_parser.go +++ b/searcher/parser/duration_parser.go @@ -17,9 +17,18 @@ var ( defaultDurationRegexp = regexp.MustCompile(`\s*(\d+)\s*.+`) ) +func cleanTimeSequence(res string) []string { + list := strings.Split(res, ":") + rs := make([]string, 0, len(list)) + for _, item := range list { + rs = append(rs, strings.TrimSpace(item)) + } + return rs +} + func DefaultHHMMSSDurationParser(ctx context.Context) decoder.NumberParseFunc { return func(v string) int64 { - res := strings.Split(v, ":") + res := cleanTimeSequence(v) if len(res) > 3 { logutil.GetLogger(ctx).Error("invalid time format", zap.String("data", v)) return 0 diff --git a/searcher/plugin/constant/constant.go b/searcher/plugin/constant/constant.go index 1600bc7..c9070b4 100644 --- a/searcher/plugin/constant/constant.go +++ b/searcher/plugin/constant/constant.go @@ -13,4 +13,6 @@ const ( SS18AV = "18av" SSTKTube = "tktube" SSNJav = "njav" + SSFc2PPVDB = "fc2ppvdb" + SSMissav = "missav" ) diff --git a/searcher/plugin/impl/fc2ppvdb.go b/searcher/plugin/impl/fc2ppvdb.go new file mode 100644 index 0000000..0f83c21 --- /dev/null +++ b/searcher/plugin/impl/fc2ppvdb.go @@ -0,0 +1,73 @@ +package impl + +import ( + "context" + "fmt" + "net/http" + "yamdc/model" + "yamdc/number" + "yamdc/searcher/decoder" + "yamdc/searcher/parser" + "yamdc/searcher/plugin/api" + "yamdc/searcher/plugin/constant" + "yamdc/searcher/plugin/factory" + "yamdc/searcher/plugin/meta" + "yamdc/searcher/utils" +) + +var defaultFc2PPVDBDomains = []string{ + "fc2ppvdb.com", +} + +type fc2ppvdb struct { + api.DefaultPlugin +} + +func (p *fc2ppvdb) OnMakeHTTPRequest(ctx context.Context, nid *number.Number) (*http.Request, error) { + vid, ok := number.DecodeFc2ValID(nid.GetNumberID()) + if !ok { + return nil, fmt.Errorf("unable to decode fc2 vid") + } + link := fmt.Sprintf("https://%s/articles/%s", api.MustSelectDomain(defaultFc2PPVDBDomains), vid) + req, err := http.NewRequest(http.MethodGet, link, nil) + if err != nil { + return nil, err + } + return req, nil +} + +func (p *fc2ppvdb) OnDecodeHTTPData(ctx context.Context, data []byte) (*model.AvMeta, bool, error) { + dec := decoder.XPathHtmlDecoder{ + NumberExpr: `//div[contains(text(), "ID")]/span/text()`, + TitleExpr: `//div[@class="w-full lg:pl-8 px-2 lg:w-3/5"]/h2/a/text()`, + PlotExpr: "", + ActorListExpr: `//div[contains(text(), "女優")]/span/a/text()`, + ReleaseDateExpr: `//div[contains(text(), "販売日")]/span/text()`, + DurationExpr: `//div[contains(text(), "収録時間")]/span/text()`, + StudioExpr: `//div[contains(text(), "販売者")]/span/a/text()`, + LabelExpr: "", + DirectorExpr: `//div[contains(text(), "販売者")]/span/a/text()`, + SeriesExpr: "", + GenreListExpr: `//div[contains(text(), "タグ")]/span/a/text()`, + CoverExpr: `//div[@class="lg:w-2/5 w-full mb-12 md:mb-0"]/a/img/@src`, + PosterExpr: `//div[@class="lg:w-2/5 w-full mb-12 md:mb-0"]/a/img/@src`, + SampleImageListExpr: "", + } + mdata, err := dec.DecodeHTML(data, + decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx)), + decoder.WithDurationParser(parser.DefaultHHMMSSDurationParser(ctx)), + ) + if err != nil { + return nil, false, err + } + if len(mdata.Number) == 0 { + return nil, false, nil + } + mdata.Number = meta.GetNumberId(ctx) + utils.EnableDataTranslate(mdata) + return mdata, true, nil +} + +func init() { + factory.Register(constant.SSFc2PPVDB, factory.PluginToCreator(&fc2ppvdb{})) +} diff --git a/searcher/plugin/impl/missav.go b/searcher/plugin/impl/missav.go new file mode 100644 index 0000000..1ce6585 --- /dev/null +++ b/searcher/plugin/impl/missav.go @@ -0,0 +1,93 @@ +package impl + +import ( + "context" + "fmt" + "net/http" + "strings" + "yamdc/model" + "yamdc/number" + "yamdc/searcher/decoder" + "yamdc/searcher/parser" + "yamdc/searcher/plugin/api" + "yamdc/searcher/plugin/constant" + "yamdc/searcher/plugin/factory" + "yamdc/searcher/plugin/meta" + "yamdc/searcher/plugin/twostep" +) + +var ( + defaultMissavDomains = []string{ + "missav.ws", + } +) + +type missav struct { + api.DefaultPlugin +} + +func (p *missav) OnMakeHTTPRequest(ctx context.Context, number *number.Number) (*http.Request, error) { + link := fmt.Sprintf("https://%s/cn/search/%s", api.MustSelectDomain(defaultMissavDomains), number.GetNumberID()) + return http.NewRequest(http.MethodGet, link, nil) +} + +func (p *missav) OnHandleHTTPRequest(ctx context.Context, invoker api.HTTPInvoker, req *http.Request) (*http.Response, error) { + xctx := &twostep.XPathTwoStepContext{ + Ps: []*twostep.XPathPair{ + { + Name: "read-link", + XPath: `//div[@class="my-2 text-sm text-nord4 truncate"]/a[@class="text-secondary group-hover:text-primary"]/@href`, + }, + { + Name: "read-title", + XPath: `//div[@class="my-2 text-sm text-nord4 truncate"]/a[@class="text-secondary group-hover:text-primary"]/text()`, + }, + }, + LinkSelector: func(ps []*twostep.XPathPair) (string, bool, error) { + linkList := ps[0].Result + titleList := ps[1].Result + for i, link := range linkList { + title := titleList[i] + if strings.Contains(title, meta.GetNumberId(ctx)) { + return link, true, nil + } + } + return "", false, nil + }, + ValidStatusCode: []int{http.StatusOK}, + CheckResultCountMatch: true, + LinkPrefix: "", + } + return twostep.HandleXPathTwoStepSearch(ctx, invoker, req, xctx) +} + +func (p *missav) OnDecodeHTTPData(ctx context.Context, data []byte) (*model.AvMeta, bool, error) { + dec := decoder.XPathHtmlDecoder{ + NumberExpr: `//div[span[contains(text(), "番号")]]/span[@class="font-medium"]/text()`, + TitleExpr: `//div[@class="mt-4"]/h1[@class="text-base lg:text-lg text-nord6"]/text()`, + PlotExpr: "", + ActorListExpr: `//div[span[contains(text(), "女优")]]/a/text()`, + ReleaseDateExpr: `//div[span[contains(text(), "发行日期")]]/time/text()`, + DurationExpr: "", + StudioExpr: `//div[span[contains(text(), "发行商")]]/a/text()`, + LabelExpr: "", + DirectorExpr: `//div[span[contains(text(), "导演")]]/a/text()`, + SeriesExpr: "", + GenreListExpr: `//div[span[contains(text(), "类型")]]/a/text()`, + CoverExpr: `//link[@rel="preload" and @as="image"]/@href`, + PosterExpr: "", + SampleImageListExpr: "", + } + mdata, err := dec.DecodeHTML(data, decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx))) + if err != nil { + return nil, false, err + } + if len(mdata.Number) == 0 { + return nil, false, err + } + return mdata, true, nil +} + +func init() { + factory.Register(constant.SSMissav, factory.PluginToCreator(&missav{})) +}