-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from xxxsen/xxxsen/feature/more_scrape_source
Xxxsen/feature/more scrape source
- Loading branch information
Showing
13 changed files
with
291 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package handler | ||
|
||
import ( | ||
"context" | ||
"regexp" | ||
"strings" | ||
"yamdc/model" | ||
) | ||
|
||
// defaultExtractActorRegexp matches one "outside (inside)" pair. Capture
// group 1 is the text before the opening parenthesis and capture group 2 is
// the text between the parentheses.
var defaultExtractActorRegexp = regexp.MustCompile(`\s*(.+?)\s*\(\s*(.+?)\s*\)`)
|
||
// actorSplitHandler is a stateless handler that splits actor entries written
// as "name (alias)" into separate actor names.
type actorSplitHandler struct{}
|
||
func (h *actorSplitHandler) cleanActor(actor string) string { | ||
actor = strings.TrimSpace(actor) | ||
actor = strings.ReplaceAll(actor, "(", "(") | ||
actor = strings.ReplaceAll(actor, ")", ")") | ||
return actor | ||
} | ||
|
||
func (h *actorSplitHandler) tryExtractActor(actor string) ([]string, bool) { | ||
// 查找所有匹配的内容 | ||
matches := defaultExtractActorRegexp.FindAllStringSubmatch(actor, -1) | ||
|
||
if len(matches) == 0 { | ||
return nil, false | ||
} | ||
rs := make([]string, 0, 2) | ||
for _, match := range matches { | ||
if len(match) == 3 { // match[0] 是整个匹配的字符串,match[1] 和 match[2] 是捕获组 | ||
rs = append(rs, strings.TrimSpace(match[1]), strings.TrimSpace(match[2])) | ||
} | ||
} | ||
return rs, true | ||
} | ||
|
||
func (h *actorSplitHandler) Handle(ctx context.Context, fc *model.FileContext) error { | ||
//如果女优有括号, 尝试将其从括号中提取出来, example: 永野司 (永野つかさ) | ||
actorlist := make([]string, 0, len(fc.Meta.Actors)) | ||
for _, actor := range fc.Meta.Actors { | ||
actor = h.cleanActor(actor) | ||
splited, ok := h.tryExtractActor(actor) | ||
if !ok { | ||
actorlist = append(actorlist, actor) | ||
continue | ||
} | ||
actorlist = append(actorlist, splited...) | ||
} | ||
fc.Meta.Actors = actorlist | ||
return nil | ||
} | ||
|
||
func init() { | ||
Register(HActorSpliter, HandlerToCreator(&actorSplitHandler{})) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
package handler | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"yamdc/model" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestSplitActor(t *testing.T) { | ||
tests := []string{ | ||
"永野司 (永野つかさ)", | ||
"萨达(AA萨达)", | ||
} | ||
h := &actorSplitHandler{} | ||
in := &model.FileContext{ | ||
Meta: &model.AvMeta{ | ||
Actors: tests, | ||
}, | ||
} | ||
err := h.Handle(context.Background(), in) | ||
assert.NoError(t, err) | ||
t.Logf("read actor list:%+v", in.Meta.Actors) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,4 +13,6 @@ const ( | |
SS18AV = "18av" | ||
SSTKTube = "tktube" | ||
SSNJav = "njav" | ||
SSFc2PPVDB = "fc2ppvdb" | ||
SSMissav = "missav" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
package impl | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"net/http" | ||
"yamdc/model" | ||
"yamdc/number" | ||
"yamdc/searcher/decoder" | ||
"yamdc/searcher/parser" | ||
"yamdc/searcher/plugin/api" | ||
"yamdc/searcher/plugin/constant" | ||
"yamdc/searcher/plugin/factory" | ||
"yamdc/searcher/plugin/meta" | ||
"yamdc/searcher/utils" | ||
) | ||
|
||
// defaultFc2PPVDBDomains lists the hosts from which a domain is selected when
// building fc2ppvdb request URLs.
var defaultFc2PPVDBDomains = []string{"fc2ppvdb.com"}
|
||
type fc2ppvdb struct { | ||
api.DefaultPlugin | ||
} | ||
|
||
func (p *fc2ppvdb) OnMakeHTTPRequest(ctx context.Context, nid *number.Number) (*http.Request, error) { | ||
vid, ok := number.DecodeFc2ValID(nid.GetNumberID()) | ||
if !ok { | ||
return nil, fmt.Errorf("unable to decode fc2 vid") | ||
} | ||
link := fmt.Sprintf("https://%s/articles/%s", api.MustSelectDomain(defaultFc2PPVDBDomains), vid) | ||
req, err := http.NewRequest(http.MethodGet, link, nil) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return req, nil | ||
} | ||
|
||
func (p *fc2ppvdb) OnDecodeHTTPData(ctx context.Context, data []byte) (*model.AvMeta, bool, error) { | ||
dec := decoder.XPathHtmlDecoder{ | ||
NumberExpr: `//div[contains(text(), "ID")]/span/text()`, | ||
TitleExpr: `//div[@class="w-full lg:pl-8 px-2 lg:w-3/5"]/h2/a/text()`, | ||
PlotExpr: "", | ||
ActorListExpr: `//div[contains(text(), "女優")]/span/a/text()`, | ||
ReleaseDateExpr: `//div[contains(text(), "販売日")]/span/text()`, | ||
DurationExpr: `//div[contains(text(), "収録時間")]/span/text()`, | ||
StudioExpr: `//div[contains(text(), "販売者")]/span/a/text()`, | ||
LabelExpr: "", | ||
DirectorExpr: `//div[contains(text(), "販売者")]/span/a/text()`, | ||
SeriesExpr: "", | ||
GenreListExpr: `//div[contains(text(), "タグ")]/span/a/text()`, | ||
CoverExpr: `//div[@class="lg:w-2/5 w-full mb-12 md:mb-0"]/a/img/@src`, | ||
PosterExpr: `//div[@class="lg:w-2/5 w-full mb-12 md:mb-0"]/a/img/@src`, | ||
SampleImageListExpr: "", | ||
} | ||
mdata, err := dec.DecodeHTML(data, | ||
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx)), | ||
decoder.WithDurationParser(parser.DefaultHHMMSSDurationParser(ctx)), | ||
) | ||
if err != nil { | ||
return nil, false, err | ||
} | ||
if len(mdata.Number) == 0 { | ||
return nil, false, nil | ||
} | ||
mdata.Number = meta.GetNumberId(ctx) | ||
utils.EnableDataTranslate(mdata) | ||
return mdata, true, nil | ||
} | ||
|
||
func init() { | ||
factory.Register(constant.SSFc2PPVDB, factory.PluginToCreator(&fc2ppvdb{})) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
package impl | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"net/http" | ||
"strings" | ||
"yamdc/model" | ||
"yamdc/number" | ||
"yamdc/searcher/decoder" | ||
"yamdc/searcher/parser" | ||
"yamdc/searcher/plugin/api" | ||
"yamdc/searcher/plugin/constant" | ||
"yamdc/searcher/plugin/factory" | ||
"yamdc/searcher/plugin/meta" | ||
"yamdc/searcher/plugin/twostep" | ||
) | ||
|
||
// defaultMissavDomains lists the hosts from which a domain is selected when
// building missav request URLs.
var defaultMissavDomains = []string{"missav.ws"}
|
||
type missav struct { | ||
api.DefaultPlugin | ||
} | ||
|
||
func (p *missav) OnMakeHTTPRequest(ctx context.Context, number *number.Number) (*http.Request, error) { | ||
link := fmt.Sprintf("https://%s/cn/search/%s", api.MustSelectDomain(defaultMissavDomains), number.GetNumberID()) | ||
return http.NewRequest(http.MethodGet, link, nil) | ||
} | ||
|
||
func (p *missav) OnHandleHTTPRequest(ctx context.Context, invoker api.HTTPInvoker, req *http.Request) (*http.Response, error) { | ||
xctx := &twostep.XPathTwoStepContext{ | ||
Ps: []*twostep.XPathPair{ | ||
{ | ||
Name: "read-link", | ||
XPath: `//div[@class="my-2 text-sm text-nord4 truncate"]/a[@class="text-secondary group-hover:text-primary"]/@href`, | ||
}, | ||
{ | ||
Name: "read-title", | ||
XPath: `//div[@class="my-2 text-sm text-nord4 truncate"]/a[@class="text-secondary group-hover:text-primary"]/text()`, | ||
}, | ||
}, | ||
LinkSelector: func(ps []*twostep.XPathPair) (string, bool, error) { | ||
linkList := ps[0].Result | ||
titleList := ps[1].Result | ||
for i, link := range linkList { | ||
title := titleList[i] | ||
if strings.Contains(title, meta.GetNumberId(ctx)) { | ||
return link, true, nil | ||
} | ||
} | ||
return "", false, nil | ||
}, | ||
ValidStatusCode: []int{http.StatusOK}, | ||
CheckResultCountMatch: true, | ||
LinkPrefix: "", | ||
} | ||
return twostep.HandleXPathTwoStepSearch(ctx, invoker, req, xctx) | ||
} | ||
|
||
func (p *missav) OnDecodeHTTPData(ctx context.Context, data []byte) (*model.AvMeta, bool, error) { | ||
dec := decoder.XPathHtmlDecoder{ | ||
NumberExpr: `//div[span[contains(text(), "番号")]]/span[@class="font-medium"]/text()`, | ||
TitleExpr: `//div[@class="mt-4"]/h1[@class="text-base lg:text-lg text-nord6"]/text()`, | ||
PlotExpr: "", | ||
ActorListExpr: `//div[span[contains(text(), "女优")]]/a/text()`, | ||
ReleaseDateExpr: `//div[span[contains(text(), "发行日期")]]/time/text()`, | ||
DurationExpr: "", | ||
StudioExpr: `//div[span[contains(text(), "发行商")]]/a/text()`, | ||
LabelExpr: "", | ||
DirectorExpr: `//div[span[contains(text(), "导演")]]/a/text()`, | ||
SeriesExpr: "", | ||
GenreListExpr: `//div[span[contains(text(), "类型")]]/a/text()`, | ||
CoverExpr: `//link[@rel="preload" and @as="image"]/@href`, | ||
PosterExpr: "", | ||
SampleImageListExpr: "", | ||
} | ||
mdata, err := dec.DecodeHTML(data, decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx))) | ||
if err != nil { | ||
return nil, false, err | ||
} | ||
if len(mdata.Number) == 0 { | ||
return nil, false, err | ||
} | ||
return mdata, true, nil | ||
} | ||
|
||
func init() { | ||
factory.Register(constant.SSMissav, factory.PluginToCreator(&missav{})) | ||
} |