Skip to content

Commit

Permalink
重写书名处理逻辑
Browse files Browse the repository at this point in the history
  • Loading branch information
jianyun8023 committed Dec 14, 2024
1 parent 46a5b19 commit c0e56de
Show file tree
Hide file tree
Showing 5 changed files with 338 additions and 21 deletions.
2 changes: 1 addition & 1 deletion cmd/clname.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func ParseEpub(file string, c *ClnameConfig) error {
return fmt.Errorf("无法获得书籍标题")
}
title := book.Opf.Metadata.Title[0]
newTitle := util.CleanTitle(title)
newTitle := util.TryCleanTitle(title)
if title == newTitle {
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ go 1.18

require (
github.com/kapmahc/epub v0.1.1
github.com/spf13/cobra v1.6.1
github.com/spf13/cobra v1.8.1
)

require (
github.com/inconshreveable/mousetrap v1.0.1 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
)
10 changes: 5 additions & 5 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc=
github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/kapmahc/epub v0.1.1 h1:a4fgmhh/q2vyzFR2QXOVohR2zAuQvbacCjMZ1LGr0lw=
github.com/kapmahc/epub v0.1.1/go.mod h1:UpnUbQO78vpmp6TC4emDTAIG6XVcdnZTnaTx06qbtYM=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA=
github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand Down
237 changes: 237 additions & 0 deletions pkg/util/cleanname.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package util
import (
"regexp"
"strings"
"unicode/utf8"
)

var (
Expand All @@ -13,7 +14,243 @@ func CleanTitle(title string) string {
// CleanTitle removes every ReNameReg match from title, replaces double quotes
// with spaces and trims surrounding whitespace. The title is returned
// unchanged when ReNameReg does not match, or when any match is shorter than
// 10 bytes. ReNameReg is declared outside this function.

// Nothing to strip when the pattern does not occur at all.
if len(ReNameReg.FindAllString(title, -1)) == 0 {
return title
}

// A single short match (len counts bytes, not runes) vetoes the whole
// clean-up and the title is returned as-is.
for _, match := range ReNameReg.FindAllStringSubmatch(title, -1) {
if len(match[0]) < 10 {
// NOTE(review): the builtin println writes to standard error; this
// looks like leftover debug output.
println(title + "--------" + match[0])
return title
}
}
// Drop all matches, then turn double quotes into spaces and trim the result.
newTitle := ReNameReg.ReplaceAllString(title, "")
newTitle = strings.TrimSpace(strings.ReplaceAll(newTitle, "\"", " "))
return newTitle
}

// Stack is a last-in, first-out collection of strings. The zero value is an
// empty stack that is ready to use.
type Stack struct {
	items []string
}

// Push places item on top of the stack.
func (s *Stack) Push(item string) {
	s.items = append(s.items, item)
}

// Pop removes the top item and returns it. An empty stack yields "".
func (s *Stack) Pop() string {
	last := len(s.items) - 1
	if last < 0 {
		return ""
	}
	top := s.items[last]
	s.items = s.items[:last]
	return top
}

// IsEmpty reports whether the stack holds no items.
func (s *Stack) IsEmpty() bool {
	return len(s.items) == 0
}

// Peek returns the top item without removing it. An empty stack yields "".
func (s *Stack) Peek() string {
	if n := len(s.items); n > 0 {
		return s.items[n-1]
	}
	return ""
}

// GetItems returns the stack's items from bottom to top. The returned slice
// is the stack's own storage, not a copy.
func (s *Stack) GetItems() []string {
	return s.items
}

// TryCleanTitle shortens a book title by dropping promotional bracket groups.
//
// The title is split into top-level segments: plain text and complete bracket
// groups delimited by 【】, [], full-width parentheses or ASCII parentheses.
// The first segment is always kept. The second and third segments are kept
// only when preserve accepts them; every later segment is dropped. Double
// quotes in the result become spaces and surrounding whitespace is trimmed.
// If nothing is left, the original title is returned unchanged.
//
// Example:
//
//	《大江大河》作者阿耐合集(共12册)(含《大江大河》(全4册)…,阿耐出品,必是精品!)
//	-> 《大江大河》作者阿耐合集(共12册)
func TryCleanTitle(title string) string {
	var (
		segments []string        // finished top-level segments, in order
		open     []rune          // closing brackets still expected, innermost last
		word     strings.Builder // segment currently being collected
	)
	flush := func() {
		segments = append(segments, word.String())
		word.Reset()
	}

	for _, r := range title {
		if closer, ok := closingBracketFor(r); ok {
			// A bracket opened at top level ends the plain text before it.
			if len(open) == 0 && word.Len() > 0 {
				flush()
			}
			word.WriteRune(r)
			open = append(open, closer)
			continue
		}

		word.WriteRune(r)
		// open only ever holds closing brackets, so this matches exactly
		// when r closes the innermost open bracket.
		if n := len(open); n > 0 && open[n-1] == r {
			open = open[:n-1]
			// Only a closed outermost bracket ends a segment; nested groups
			// stay inside their parent so no unbalanced fragment leaks out.
			if len(open) == 0 {
				flush()
			}
		}
	}
	if word.Len() > 0 {
		flush()
	}
	if len(segments) == 0 {
		return title
	}

	out := segments[0]
	for i := 1; i < len(segments) && i <= 2; i++ {
		if preserve(segments[i]) {
			out += segments[i]
		}
	}

	// Replace double quotes with spaces, then trim leading/trailing whitespace.
	out = strings.TrimSpace(strings.ReplaceAll(out, "\"", " "))
	if out == "" {
		return title
	}
	return out
}

// closingBracketFor returns the closing partner of an opening bracket and
// true, or 0 and false when r is not an opening bracket. The full-width
// parentheses are written as escapes so they cannot be mistaken for their
// ASCII look-alikes.
func closingBracketFor(r rune) (rune, bool) {
	switch r {
	case '【':
		return '】', true
	case '[':
		return ']', true
	case '\uFF08':
		return '\uFF09', true
	case '(':
		return ')', true
	}
	return 0, false
}

// preservePattern matches bracket contents worth keeping, such as volume,
// set, edition or range markers. It is compiled once at package scope
// instead of on every preserve call. \x{FF5E} is the full-width tilde.
var preservePattern = regexp.MustCompile(strings.Join([]string{
	`.{2,6}篇`,
	`[上中下+]`,
	`[上中下、]+[册本卷部辑]`,
	`套装.*?[册本卷部辑]`,
	`[全共].*?[册本卷部辑]`,
	`\d+[册本卷部辑]`,
	`第.*?[版卷部辑]`,
	`[\d一二三四五六七八九十百千]+[-~—\x{FF5E}][\d一二三四五六七八九十百千]+`,
	`\d{4}[-~—\x{FF5E}]\d{4}`,
}, "|"))

// preserve reports whether a bracketed title segment should be kept.
//
// One leading and one trailing bracket of each supported kind is stripped
// first. The remaining text is kept when it is at most 3 runes long, when it
// is shorter than 20 runes and matches preservePattern, or when it is shorter
// than 10 runes and ends in "版".
func preserve(content string) bool {
	c := content
	for _, opening := range []string{"【", "[", "\uFF08", "("} {
		c = strings.TrimPrefix(c, opening)
	}
	for _, closing := range []string{"】", "]", "\uFF09", ")"} {
		c = strings.TrimSuffix(c, closing)
	}

	n := utf8.RuneCountInString(c)
	if n <= 3 {
		return true
	}
	if n < 20 && preservePattern.MatchString(c) {
		return true
	}
	return n < 10 && strings.HasSuffix(c, "版")
}

// Patterns used by NewCleanTitle, compiled once at package scope.
// \x{FF08} and \x{FF09} are the full-width parentheses and \x{FF5E} is the
// full-width tilde; they are written as escapes so they cannot be confused
// with their ASCII look-alikes.
var (
	// newCleanSquareRe matches a 【…】 or […] group.
	newCleanSquareRe = regexp.MustCompile(`[【\[].*?[】\]]`)

	// newCleanPreserveRe matches parenthesised content that must be kept.
	newCleanPreserveRe = regexp.MustCompile(strings.Join([]string{
		`.{2,6}篇`,
		`修订版`,
		`[上中下、]+[册本卷部]`,
		`套装.*?[册本卷部]`,
		`[全共].*?[册本卷部]`,
		`第.*?[版卷部]`,
		`[\d一二三四五六七八九十百千]+[-~—\x{FF5E}][\d一二三四五六七八九十百千]+`,
		`\d{4}[-~—\x{FF5E}]\d{4}`,
	}, "|"))

	// newCleanParenRes lists the parenthesis passes in the order they run:
	// full-width pair, ASCII pair, then the two mixed pairs. The mixed
	// patterns refuse to cross another parenthesis so they cannot swallow
	// two adjacent groups that earlier passes decided to keep.
	newCleanParenRes = []*regexp.Regexp{
		regexp.MustCompile(`\x{FF08}.*?\x{FF09}`),
		regexp.MustCompile(`\(.*?\)`),
		regexp.MustCompile(`\x{FF08}[^()\x{FF08}\x{FF09}]*\)`),
		regexp.MustCompile(`\([^()\x{FF08}\x{FF09}]*\x{FF09}`),
	}
)

// NewCleanTitle cleans a book title with regular expressions.
//
// It removes every 【…】 and […] group, then removes each parenthesised group
// (full-width, ASCII or mixed) unless its content is shorter than 8 runes or
// matches newCleanPreserveRe. A title wrapped in a single pair of 《》 loses
// that pair. Finally double quotes become spaces and surrounding whitespace
// is trimmed.
func NewCleanTitle(title string) string {
	// Remove square brackets 【】 and [] together with their content.
	title = newCleanSquareRe.ReplaceAllString(title, "")

	// keepOrDrop decides the fate of one matched parenthesis group.
	keepOrDrop := func(group string) string {
		content := strings.Trim(group, "()\uFF08\uFF09")
		// Short content is kept as-is.
		if utf8.RuneCountInString(content) < 8 {
			return group
		}
		if newCleanPreserveRe.MatchString(content) {
			return group
		}
		return ""
	}
	for _, re := range newCleanParenRes {
		title = re.ReplaceAllStringFunc(title, keepOrDrop)
	}

	// Remove the book-title marks 《 and 》, but only when they wrap the whole
	// title. "《A》与《B》" starts and ends with them too and must stay intact.
	if strings.HasPrefix(title, "《") && strings.HasSuffix(title, "》") {
		inner := strings.TrimSuffix(strings.TrimPrefix(title, "《"), "》")
		if !strings.ContainsAny(inner, "《》") {
			title = inner
		}
	}

	// Replace double quotes with spaces, then trim leading/trailing whitespace.
	title = strings.ReplaceAll(title, "\"", " ")
	return strings.TrimSpace(title)
}
Loading

0 comments on commit c0e56de

Please sign in to comment.