From f3652d06e2eccb47c0b174d200644d1dc1c3788c Mon Sep 17 00:00:00 2001 From: TakenPt Date: Tue, 5 Mar 2024 21:15:38 +0900 Subject: [PATCH 1/5] wip --- .../KoeBook.Epub/Services/ScrapingNaroService.cs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index a1e00ec..87c7d2e 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -66,7 +66,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } var document = new EpubDocument(bookTitle.InnerHtml, bookAuther.InnerHtml, coverFilePath, id); - if (isRensai) + if (isRensai) // 連載の時 { List SectionWithChapterTitleList = new List(); for (int i = 1; i <= allNum; i++) @@ -93,6 +93,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } else { + document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section); } } @@ -106,7 +107,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } } } - else + else // 短編の時 { var load = await ReadPageAsync(url, isRensai, imageDirectory, ct).ConfigureAwait(false); if (load != null) @@ -128,13 +129,16 @@ private async Task ReadPageAsync(string url, bool isRen using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); - var chapterTitleElement = doc.QuerySelector(".chapter_title"); string? chapterTitle = null; - if (chapterTitleElement != null) + if (!isRensai) { - if (chapterTitleElement.InnerHtml != null) + var chapterTitleElement = doc.QuerySelector(".chapter_title"); + if (chapterTitleElement != null) { - chapterTitle = chapterTitleElement.InnerHtml; + if (chapterTitleElement.InnerHtml != null) + { + chapterTitle = chapterTitleElement.InnerHtml; + } } } From 16a7f932d7d90fee59728292e86a2552968633e3 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Wed, 6 Mar 2024 14:19:17 +0900 Subject: [PATCH 2/5] =?UTF-8?q?=E4=BC=9A=E8=A9=B1=E6=96=87=E3=81=AE?= =?UTF-8?q?=E9=96=8B=E5=A7=8B=E3=80=81=E7=B5=82=E4=BA=86=E8=A8=98=E5=8F=B7?= =?UTF-8?q?=E3=81=AB=E3=80=8E=E5=8F=8A=E3=81=B3=E3=80=8F=E3=82=92=E8=AA=8D?= =?UTF-8?q?=E3=82=81=E3=82=8B=E3=82=88=E3=81=86=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingNaroService.cs | 6 +-- Epub/KoeBook.Epub/Utility/ScrapingHelper.cs | 10 ++-- KoeBook.Test/Epub/ScrapingHelperTest.cs | 47 ++++++++++++++++++- 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index 87c7d2e..ba33b8a 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -71,8 +71,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP List SectionWithChapterTitleList = new List(); for (int i = 1; i <= allNum; i++) { - Console.WriteLine(i); - await Task.Delay(500, ct); + await Task.Delay(1500, ct); var pageUrl = Path.Combine(url, i.ToString()); var load = await ReadPageAsync(pageUrl, isRensai, imageDirectory, ct).ConfigureAwait(false); SectionWithChapterTitleList.Add(load); @@ -93,7 +92,6 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } else { - document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section); } } @@ -123,7 +121,7 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no); private record SectionWithChapterTitle(string? title, Section section); - private async Task ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct) + private static async Task ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); diff --git a/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs b/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs index 7b7a337..45991a9 100644 --- a/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs +++ b/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs @@ -4,7 +4,7 @@ public static class ScrapingHelper { public static List SplitBrace(string text) { - if (text.Length == 1 && (text == "「" || text == "」")) + if (text.Length == 1 && (text == "「" || text == "『" || text == "」" || text == "』")) return [text]; var bracket = 0; @@ -12,8 +12,8 @@ public static List SplitBrace(string text) for (var i = 0; i < text.Length; i++) { var c = text[i]; - if (c == '「') bracket++; - else if (c == '」') bracket--; + if (c == '「' || c == '『') bracket++; + else if (c == '」' || c == '』') bracket--; brackets[i] = bracket; } @@ -23,12 +23,12 @@ public static List SplitBrace(string text) for (var i = 0; i < brackets.Length; i++) { brackets[i] -= mn; - if (text[i] == '「' && brackets[i] == 1 && i != 0 && startIdx != i) + if ((text[i] == '「' || text[i] == '『') && brackets[i] == 1 && i != 0 && startIdx != i) { result.Add(text[startIdx..i]); startIdx = i; } - if (text[i] == '」' && brackets[i] == 0) + if ((text[i] == '」' || text[i] == '』') && brackets[i] == 0) { result.Add(text[startIdx..(i + 1)]); startIdx = i + 1; diff --git a/KoeBook.Test/Epub/ScrapingHelperTest.cs b/KoeBook.Test/Epub/ScrapingHelperTest.cs index 394315e..055cceb 100644 --- a/KoeBook.Test/Epub/ScrapingHelperTest.cs +++ b/KoeBook.Test/Epub/ScrapingHelperTest.cs @@ -7,6 +7,7 @@ public class ScrapingHelperTest public static object[][] TestCases() { (string, List)[] cases = [ + // '「''」'のみの場合のケース ("「", ["「"]), ("」", ["」"]), ("a", ["a"]), @@ -29,7 +30,51 @@ public static object[][] TestCases() ("abc「abc「abc」abc", ["abc", "「abc「abc」abc"]), ("abc「abc」abc」abc", ["abc「abc」abc」", "abc"]), ("abc「abc「abc", ["abc", "「abc「abc"]), - ("abc」abc」abc", ["abc」abc」", "abc"]) + ("abc」abc」abc", ["abc」abc」", "abc"]), + // '『''』'のみの場合のケース + ("『", ["『"]), + ("』", ["』"]), + ("a", ["a"]), + ("abc『abc』abc", ["abc", "『abc』", "abc"]), + ("abc『abc』", ["abc", "『abc』"]), + ("『abc』abc", ["『abc』", "abc",]), + ("abc『abc』", ["abc", "『abc』"]), + ("『abc』", ["『abc』",]), + ("abc『abc", ["abc", "『abc"]), + ("abc『", ["abc", "『"]), + ("『abc", ["『abc"]), + ("abc』abc", ["abc』", "abc"]), + ("abc』", ["abc』"]), + ("』abc", ["』", "abc"]), + ("abc『abc』abc『abc』abc", ["abc", "『abc』", "abc", "『abc』", "abc"]), + ("『abc』abc『abc』abc", ["『abc』", "abc", "『abc』", "abc"]), + ("abc『abc』『abc』abc", ["abc", "『abc』", "『abc』", "abc"]), + ("abc『abc』abc『abc』", ["abc", "『abc』", "abc", "『abc』"]), + ("abc『abc『abc』abc』abc", ["abc", "『abc『abc』abc』", "abc"]), + ("abc『abc『abc』abc", ["abc", "『abc『abc』abc"]), + ("abc『abc』abc』abc", ["abc『abc』abc』", "abc"]), + ("abc『abc『abc", ["abc", "『abc『abc"]), + ("abc』abc』abc", ["abc』abc』", "abc"]), + // '「''」''『''』'が混在するパターン + ("abc「abc」abc『abc』abc", ["abc", "「abc」", "abc", "『abc』", "abc"]), + ("abc『abc』abc「abc」abc", ["abc", "『abc』", "abc", "「abc」", "abc"]), + ("「abc」abc『abc』abc", ["「abc」", "abc", "『abc』", "abc"]), + ("『abc』abc「abc」abc", ["『abc』", "abc", "「abc」", "abc"]), + ("abc「abc」『abc』abc", ["abc", "「abc」", "『abc』", "abc"]), + ("abc『abc』「abc」abc", ["abc", "『abc』", "「abc」", "abc"]), + ("abc「abc」abc『abc』", ["abc", "「abc」", "abc", "『abc』"]), + ("abc『abc』abc「abc」", ["abc", "『abc』", "abc", "「abc」"]), + ("abc「abc『abc』abc」abc", ["abc", "「abc『abc』abc」", "abc"]), + ("abc『abc「abc」abc』abc", ["abc", "『abc「abc」abc』", "abc"]), + ("abc「abc『abc』abc", ["abc", "「abc『abc』abc"]), + ("abc『abc「abc」abc", ["abc", "『abc「abc」abc"]), + ("abc「abc」abc』abc", ["abc「abc」abc』", "abc"]), + ("abc『abc』abc」abc", ["abc『abc』abc」", "abc"]), + ("abc「abc『abc", ["abc", "「abc『abc"]), + ("abc『abc「abc", ["abc", "『abc「abc"]), + ("abc」abc』abc", ["abc」abc』", "abc"]), + ("abc』abc」abc", ["abc』abc」", "abc"]), + ("abc』abc』abc", ["abc』abc』", "abc"]) ]; return cases.Select(c => new object[] { c.Item1, c.Item2 }).ToArray(); } From 718b8fd7e784f9226c1e86713ff84f82d2549638 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Thu, 7 Mar 2024 03:55:06 +0900 Subject: [PATCH 3/5] =?UTF-8?q?=E8=91=97=E8=80=85=E5=90=8D=E3=81=8Ca?= =?UTF-8?q?=E3=82=BF=E3=82=B0=E3=81=A7=E3=81=AA=E3=81=8F=E3=81=A6=E3=82=82?= =?UTF-8?q?=E8=91=97=E8=80=85=E5=90=8D=E3=82=92=E5=8F=96=E5=BE=97=E3=81=A7?= =?UTF-8?q?=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Services/ScrapingNaroService.cs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index ba33b8a..ccfd70a 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -25,12 +25,23 @@ public async ValueTask ScrapingAsync(string url, string coverFileP var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); // title の取得 - var bookTitle = doc.QuerySelector(".novel_title") + var bookTitleElement = doc.QuerySelector(".novel_title") ?? throw new EpubDocumentException($"Failed to get title properly.\nUrl may be not collect"); + var bookTitle = bookTitleElement.InnerHtml; // auther の取得 - var bookAuther = doc.QuerySelector(".novel_writername a") + var bookAutherElement = doc.QuerySelector(".novel_writername") ?? throw new EpubDocumentException($"Failed to get auther properly.\nUrl may be not collect"); + var bookAuther = string.Empty; + if (bookAutherElement.QuerySelector("a") is IHtmlAnchorElement bookAutherAnchorElement) + { + bookAuther = bookAutherAnchorElement.InnerHtml; + } + else + { + bookAuther = bookAutherElement.InnerHtml.Replace("作者:", ""); + } + bool isRensai = true; int allNum = 0; @@ -65,7 +76,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP throw new EpubDocumentException("faild to get data by Narou API"); } - var document = new EpubDocument(bookTitle.InnerHtml, bookAuther.InnerHtml, coverFilePath, id); + var document = new EpubDocument(bookTitle, bookAuther, coverFilePath, id); if (isRensai) // 連載の時 { List SectionWithChapterTitleList = new List(); From f91fbf6ca0f7bad306240ea091726ba3203aed28 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Fri, 8 Mar 2024 03:43:29 +0900 Subject: [PATCH 4/5] =?UTF-8?q?=E4=B8=8D=E8=A6=81=E3=81=AA=E8=A1=8C(?= =?UTF-8?q?=E5=87=A6=E7=90=86)=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/ScrapingNaroService.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index ccfd70a..6e93cbe 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -161,11 +161,10 @@ private static async Task ReadPageAsync(string url, boo sectionTitleElement = doc.QuerySelector(".novel_title"); } - string sectionTitle = ""; if (sectionTitleElement == null) throw new EpubDocumentException("Can not find title of page"); - sectionTitle = sectionTitleElement.InnerHtml; + var sectionTitle = sectionTitleElement.InnerHtml; var section = new Section(sectionTitleElement.InnerHtml); @@ -236,6 +235,7 @@ private static async Task ReadPageAsync(string url, boo if (tags.TagName != "RUBY") { isAllRuby = false; + break; } } From f7135a4b012c485de4db52b938c1350d209bcaf6 Mon Sep 17 00:00:00 2001 From: TakenPt Date: Fri, 8 Mar 2024 23:40:46 +0900 Subject: [PATCH 5/5] =?UTF-8?q?=E3=83=95=E3=82=A9=E3=83=BC=E3=83=9E?= =?UTF-8?q?=E3=83=83=E3=83=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Epub/KoeBook.Epub/Services/ScrapingNaroService.cs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index 6e93cbe..576c422 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -40,8 +40,8 @@ public async ValueTask ScrapingAsync(string url, string coverFileP else { bookAuther = bookAutherElement.InnerHtml.Replace("作者:", ""); - } - + } + bool isRensai = true; int allNum = 0; @@ -254,8 +254,6 @@ private static async Task ReadPageAsync(string url, boo return new SectionWithChapterTitle(chapterTitle, section); } - - [System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")] private static partial System.Text.RegularExpressions.Regex UrlToNcode();