diff --git a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs index a1e00ec..576c422 100644 --- a/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs +++ b/Epub/KoeBook.Epub/Services/ScrapingNaroService.cs @@ -25,12 +25,23 @@ public async ValueTask ScrapingAsync(string url, string coverFileP var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); // title の取得 - var bookTitle = doc.QuerySelector(".novel_title") + var bookTitleElement = doc.QuerySelector(".novel_title") ?? throw new EpubDocumentException($"Failed to get title properly.\nUrl may be not collect"); + var bookTitle = bookTitleElement.InnerHtml; // auther の取得 - var bookAuther = doc.QuerySelector(".novel_writername a") + var bookAutherElement = doc.QuerySelector(".novel_writername") ?? throw new EpubDocumentException($"Failed to get auther properly.\nUrl may be not collect"); + var bookAuther = string.Empty; + if (bookAutherElement.QuerySelector("a") is IHtmlAnchorElement bookAutherAnchorElement) + { + bookAuther = bookAutherAnchorElement.InnerHtml; + } + else + { + bookAuther = bookAutherElement.InnerHtml.Replace("作者:", ""); + } + bool isRensai = true; int allNum = 0; @@ -65,14 +76,13 @@ public async ValueTask ScrapingAsync(string url, string coverFileP throw new EpubDocumentException("faild to get data by Narou API"); } - var document = new EpubDocument(bookTitle.InnerHtml, bookAuther.InnerHtml, coverFilePath, id); - if (isRensai) + var document = new EpubDocument(bookTitle, bookAuther, coverFilePath, id); + if (isRensai) // 連載の時 { List SectionWithChapterTitleList = new List(); for (int i = 1; i <= allNum; i++) { - Console.WriteLine(i); - await Task.Delay(500, ct); + await Task.Delay(1500, ct); var pageUrl = Path.Combine(url, i.ToString()); var load = await ReadPageAsync(pageUrl, isRensai, imageDirectory, ct).ConfigureAwait(false); SectionWithChapterTitleList.Add(load); @@ -106,7 +116,7 @@ public async ValueTask ScrapingAsync(string url, string coverFileP } } } - else + else // 短編の時 { var load = await ReadPageAsync(url, isRensai, imageDirectory, ct).ConfigureAwait(false); if (load != null) @@ -122,19 +132,22 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no); private record SectionWithChapterTitle(string? title, Section section); - private async Task ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct) + private static async Task ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct) { var config = Configuration.Default.WithDefaultLoader(); using var context = BrowsingContext.New(config); var doc = await context.OpenAsync(url, ct).ConfigureAwait(false); - var chapterTitleElement = doc.QuerySelector(".chapter_title"); string? chapterTitle = null; - if (chapterTitleElement != null) + if (!isRensai) { - if (chapterTitleElement.InnerHtml != null) + var chapterTitleElement = doc.QuerySelector(".chapter_title"); + if (chapterTitleElement != null) { - chapterTitle = chapterTitleElement.InnerHtml; + if (chapterTitleElement.InnerHtml != null) + { + chapterTitle = chapterTitleElement.InnerHtml; + } } } @@ -148,11 +161,10 @@ private async Task ReadPageAsync(string url, bool isRen sectionTitleElement = doc.QuerySelector(".novel_title"); } - string sectionTitle = ""; if (sectionTitleElement == null) throw new EpubDocumentException("Can not find title of page"); - sectionTitle = sectionTitleElement.InnerHtml; + var sectionTitle = sectionTitleElement.InnerHtml; var section = new Section(sectionTitleElement.InnerHtml); @@ -223,6 +235,7 @@ private async Task ReadPageAsync(string url, bool isRen if (tags.TagName != "RUBY") { isAllRuby = false; + break; } } @@ -241,8 +254,6 @@ private async Task ReadPageAsync(string url, bool isRen return new SectionWithChapterTitle(chapterTitle, section); } - - [System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")] private static partial System.Text.RegularExpressions.Regex UrlToNcode(); diff --git a/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs b/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs index 7b7a337..45991a9 100644 --- a/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs +++ b/Epub/KoeBook.Epub/Utility/ScrapingHelper.cs @@ -4,7 +4,7 @@ public static class ScrapingHelper { public static List SplitBrace(string text) { - if (text.Length == 1 && (text == "「" || text == "」")) + if (text.Length == 1 && (text == "「" || text == "『" || text == "」" || text == "』")) return [text]; var bracket = 0; @@ -12,8 +12,8 @@ public static List SplitBrace(string text) for (var i = 0; i < text.Length; i++) { var c = text[i]; - if (c == '「') bracket++; - else if (c == '」') bracket--; + if (c == '「' || c == '『') bracket++; + else if (c == '」' || c == '』') bracket--; brackets[i] = bracket; } @@ -23,12 +23,12 @@ public static List SplitBrace(string text) for (var i = 0; i < brackets.Length; i++) { brackets[i] -= mn; - if (text[i] == '「' && brackets[i] == 1 && i != 0 && startIdx != i) + if ((text[i] == '「' || text[i] == '『') && brackets[i] == 1 && i != 0 && startIdx != i) { result.Add(text[startIdx..i]); startIdx = i; } - if (text[i] == '」' && brackets[i] == 0) + if ((text[i] == '」' || text[i] == '』') && brackets[i] == 0) { result.Add(text[startIdx..(i + 1)]); startIdx = i + 1; diff --git a/KoeBook.Test/Epub/ScrapingHelperTest.cs b/KoeBook.Test/Epub/ScrapingHelperTest.cs index 394315e..055cceb 100644 --- a/KoeBook.Test/Epub/ScrapingHelperTest.cs +++ b/KoeBook.Test/Epub/ScrapingHelperTest.cs @@ -7,6 +7,7 @@ public class ScrapingHelperTest public static object[][] TestCases() { (string, List)[] cases = [ + // '「''」'のみの場合のケース ("「", ["「"]), ("」", ["」"]), ("a", ["a"]), @@ -29,7 +30,51 @@ public static object[][] TestCases() ("abc「abc「abc」abc", ["abc", "「abc「abc」abc"]), ("abc「abc」abc」abc", ["abc「abc」abc」", "abc"]), ("abc「abc「abc", ["abc", "「abc「abc"]), - ("abc」abc」abc", ["abc」abc」", "abc"]) + ("abc」abc」abc", ["abc」abc」", "abc"]), + // '『''』'のみの場合のケース + ("『", ["『"]), + ("』", ["』"]), + ("a", ["a"]), + ("abc『abc』abc", ["abc", "『abc』", "abc"]), + ("abc『abc』", ["abc", "『abc』"]), + ("『abc』abc", ["『abc』", "abc",]), + ("abc『abc』", ["abc", "『abc』"]), + ("『abc』", ["『abc』",]), + ("abc『abc", ["abc", "『abc"]), + ("abc『", ["abc", "『"]), + ("『abc", ["『abc"]), + ("abc』abc", ["abc』", "abc"]), + ("abc』", ["abc』"]), + ("』abc", ["』", "abc"]), + ("abc『abc』abc『abc』abc", ["abc", "『abc』", "abc", "『abc』", "abc"]), + ("『abc』abc『abc』abc", ["『abc』", "abc", "『abc』", "abc"]), + ("abc『abc』『abc』abc", ["abc", "『abc』", "『abc』", "abc"]), + ("abc『abc』abc『abc』", ["abc", "『abc』", "abc", "『abc』"]), + ("abc『abc『abc』abc』abc", ["abc", "『abc『abc』abc』", "abc"]), + ("abc『abc『abc』abc", ["abc", "『abc『abc』abc"]), + ("abc『abc』abc』abc", ["abc『abc』abc』", "abc"]), + ("abc『abc『abc", ["abc", "『abc『abc"]), + ("abc』abc』abc", ["abc』abc』", "abc"]), + // '「''」''『''』'が混在するパターン + ("abc「abc」abc『abc』abc", ["abc", "「abc」", "abc", "『abc』", "abc"]), + ("abc『abc』abc「abc」abc", ["abc", "『abc』", "abc", "「abc」", "abc"]), + ("「abc」abc『abc』abc", ["「abc」", "abc", "『abc』", "abc"]), + ("『abc』abc「abc」abc", ["『abc』", "abc", "「abc」", "abc"]), + ("abc「abc」『abc』abc", ["abc", "「abc」", "『abc』", "abc"]), + ("abc『abc』「abc」abc", ["abc", "『abc』", "「abc」", "abc"]), + ("abc「abc」abc『abc』", ["abc", "「abc」", "abc", "『abc』"]), + ("abc『abc』abc「abc」", ["abc", "『abc』", "abc", "「abc」"]), + ("abc「abc『abc』abc」abc", ["abc", "「abc『abc』abc」", "abc"]), + ("abc『abc「abc」abc』abc", ["abc", "『abc「abc」abc』", "abc"]), + ("abc「abc『abc』abc", ["abc", "「abc『abc』abc"]), + ("abc『abc「abc」abc", ["abc", "『abc「abc」abc"]), + ("abc「abc」abc』abc", ["abc「abc」abc』", "abc"]), + ("abc『abc』abc」abc", ["abc『abc』abc」", "abc"]), + ("abc「abc『abc", ["abc", "「abc『abc"]), + ("abc『abc「abc", ["abc", "『abc「abc"]), + ("abc」abc』abc", ["abc」abc』", "abc"]), + ("abc』abc」abc", ["abc』abc」", "abc"]), + ("abc』abc』abc", ["abc』abc』", "abc"]) ]; return cases.Select(c => new object[] { c.Item1, c.Item2 }).ToArray(); }