From 3730a32bd8cb9ddf8c86f5f52797a0b8dc682b1d Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Fri, 22 Dec 2023 09:06:26 +0900
Subject: [PATCH] use same impl for splitting text into paragraphs

---
 .../lib/runners/DeduplicateParagraphs.scala   | 26 ++++++++++---------
 .../lib/filters/LargeFreqParagraphsSpec.scala | 19 ++++++++++++++
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
index 162c887..52896bf 100644
--- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
+++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
@@ -2,6 +2,7 @@ package com.worksap.nlp.uzushio.lib.runners
 
 import com.typesafe.config.ConfigFactory
 import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph, Pipeline}
+import com.worksap.nlp.uzushio.lib.runners.DeduplicateParagraphs.{cleanParagraphUdf, splitTextToParagraphs}
 import com.worksap.nlp.uzushio.lib.runners.DuplicateCandidateRow._
 import com.worksap.nlp.uzushio.lib.stats.{NgramBitSignatures, NgramHashExtractor, SimHashProcessor}
 import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource
@@ -372,7 +373,7 @@ class DeduplicateParagraphs(
     import com.worksap.nlp.uzushio.lib.utils.BuilderSyntax._
 
     val rawData = spark.read.parquet(args.inputs: _*)
-    val basicData = prepareBasicData(rawData)
+    val basicData = prepareDataForDedup(rawData)
 
     val reprParagraphs =
       if (args.hasStage("reprHashes")) {
@@ -422,14 +423,10 @@
       .option("compression", args.compression).save(args.output)
   }
 
-  private def prepareBasicData(rawData: DataFrame): DataFrame = {
-    val cleanParagraphs = udf((x: String) => Paragraphs.extractCleanParagraphs(x))
-
-    val splitDocs = rawData.select(
-      posexplode(cleanParagraphs(rawData.col("text"))).as(Seq("pos", "text"))
-    )
-
-    prepareDataset(splitDocs)
+  private def prepareDataForDedup(rawData: DataFrame): DataFrame = {
+    val exploded = splitTextToParagraphs(rawData)
+    val noMetadata = exploded.withColumn("text", cleanParagraphUdf(exploded.col("text")))
+    prepareDataset(noMetadata)
   }
 
   def prepareDataset(ds: DataFrame): DataFrame = {
@@ -845,7 +842,9 @@ object DeduplicateParagraphs {
     )
   }
 
-  private def hashParagraphs(raw: DataFrame) = {
+  private val cleanParagraphUdf = udf((s: String) => Paragraphs.extractCleanParagraph(s))
+
+  private def splitTextToParagraphs(raw: DataFrame) = {
     val explodeCols = raw.columns.map {
       case "text" => posexplode(split(raw.col("text"), "\n\n")).as(Seq("pos", "text"))
       case col => raw.col(col)
@@ -855,9 +854,12 @@ object DeduplicateParagraphs {
       octet_length(raw.col("text")) < 2 * 1024 * 1024 &&
         countParagraphs(raw.col("text")) < 1000
     ).select(explodeCols: _*)
-    val cleanParUdf = udf((s: String) => Paragraphs.extractCleanParagraph(s))
+    exploded
+  }
 
-    exploded.withColumn("parHash", xxhash64(cleanParUdf(exploded.col("text"))))
+  private def hashParagraphs(raw: DataFrame) = {
+    val exploded = splitTextToParagraphs(raw)
+    exploded.withColumn("parHash", xxhash64(cleanParagraphUdf(exploded.col("text"))))
   }
 
   def collectDocParts(
diff --git a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/LargeFreqParagraphsSpec.scala b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/LargeFreqParagraphsSpec.scala
index cc7a5a6..dc43706 100644
--- a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/LargeFreqParagraphsSpec.scala
+++ b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/LargeFreqParagraphsSpec.scala
@@ -6,6 +6,25 @@ import org.scalatest.freespec.AnyFreeSpec
 class LargeFreqParagraphsSpec extends AnyFreeSpec {
   "LargeFreqParagraphs" - {
     val filter = new LargeFreqParagraphs(freq = 10)
+
+    "deletes all paragraphs" in {
+      val doc = Document(
+        Paragraph("p", "test1", nearFreq = 15),
+        Paragraph("p", "test2", nearFreq = 30),
+        Paragraph("p", "test3", nearFreq = 30),
+      )
+      val filtered = filter.checkDocument(doc)
+      assert(filtered.paragraphs(0).remove eq filter)
+      assert(filtered.paragraphs(1).remove eq filter)
+      assert(filtered.paragraphs(2).remove eq filter)
+
+      assert(filtered.countDroppedParagraphs() == 3)
+      val docs = filtered.splitByFilteredParagraphs()
+      assert(docs.length == 1)
+      val firstDoc = docs.head
+      assert(firstDoc.remove eq filter)
+    }
+
    "works with prefix (3)" in {
       val doc = Document(
         Paragraph("p", "test1", nearFreq = 15),
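
For reference, below is a minimal, self-contained sketch of the split-and-explode pattern that the shared splitTextToParagraphs helper now applies on both the dedup-preparation and paragraph-hashing paths. It is an illustration, not code from this repository: the SparkSession setup, the object name, and the sample data are assumptions made for the example.

// A standalone sketch (not part of the patch) of the split-and-explode
// pattern that splitTextToParagraphs centralizes: documents are split on
// blank lines ("\n\n") and posexplode keeps each paragraph's position.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{posexplode, split}

object SplitParagraphsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("split-sketch")
      .getOrCreate()
    import spark.implicits._

    // One document per row; paragraphs are separated by blank lines.
    val docs = Seq("para one\n\npara two\n\npara three").toDF("text")

    // posexplode over split(text, "\n\n") yields one row per paragraph
    // together with its index, the same `pos`/`text` pair the patch produces.
    val paragraphs = docs.select(
      posexplode(split($"text", "\n\n")).as(Seq("pos", "text"))
    )

    paragraphs.show()
    // +---+----------+
    // |pos|      text|
    // +---+----------+
    // |  0|  para one|
    // |  1|  para two|
    // |  2|para three|
    // +---+----------+

    spark.stop()
  }
}

Sharing a single helper also keeps the guards consistent on both paths: documents over 2 MiB (octet_length) or with 1000 or more paragraphs are filtered out before exploding, and the same cleanParagraphUdf strips paragraph metadata both before hashing and before dedup preparation.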