use same impl for splitting text into paragraphs
eiennohito committed Dec 22, 2023
1 parent 839af2e commit 3730a32
Showing 2 changed files with 33 additions and 12 deletions.
DeduplicateParagraphs.scala:

@@ -2,6 +2,7 @@ package com.worksap.nlp.uzushio.lib.runners
 
 import com.typesafe.config.ConfigFactory
 import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph, Pipeline}
+import com.worksap.nlp.uzushio.lib.runners.DeduplicateParagraphs.{cleanParagraphUdf, splitTextToParagraphs}
 import com.worksap.nlp.uzushio.lib.runners.DuplicateCandidateRow._
 import com.worksap.nlp.uzushio.lib.stats.{NgramBitSignatures, NgramHashExtractor, SimHashProcessor}
 import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource

@@ -372,7 +373,7 @@ class DeduplicateParagraphs(
     import com.worksap.nlp.uzushio.lib.utils.BuilderSyntax._
     val rawData = spark.read.parquet(args.inputs: _*)
 
-    val basicData = prepareBasicData(rawData)
+    val basicData = prepareDataForDedup(rawData)
 
     val reprParagraphs =
       if (args.hasStage("reprHashes")) {

@@ -422,14 +423,10 @@ class DeduplicateParagraphs(
       .option("compression", args.compression).save(args.output)
   }
 
-  private def prepareBasicData(rawData: DataFrame): DataFrame = {
-    val cleanParagraphs = udf((x: String) => Paragraphs.extractCleanParagraphs(x))
-
-    val splitDocs = rawData.select(
-      posexplode(cleanParagraphs(rawData.col("text"))).as(Seq("pos", "text"))
-    )
-
-    prepareDataset(splitDocs)
+  private def prepareDataForDedup(rawData: DataFrame): DataFrame = {
+    val exploded = splitTextToParagraphs(rawData)
+    val noMetadata = exploded.withColumn("text", cleanParagraphUdf(exploded.col("text")))
+    prepareDataset(noMetadata)
   }
 
   def prepareDataset(ds: DataFrame): DataFrame = {

@@ -845,7 +842,9 @@ object DeduplicateParagraphs {
     )
   }
 
-  private def hashParagraphs(raw: DataFrame) = {
+  private val cleanParagraphUdf = udf((s: String) => Paragraphs.extractCleanParagraph(s))
+
+  private def splitTextToParagraphs(raw: DataFrame) = {
     val explodeCols = raw.columns.map {
       case "text" => posexplode(split(raw.col("text"), "\n\n")).as(Seq("pos", "text"))
       case col => raw.col(col)

@@ -855,9 +854,12 @@
       octet_length(raw.col("text")) < 2 * 1024 * 1024 && countParagraphs(raw.col("text")) < 1000
     ).select(explodeCols: _*)
 
-    val cleanParUdf = udf((s: String) => Paragraphs.extractCleanParagraph(s))
+    exploded
+  }
 
-    exploded.withColumn("parHash", xxhash64(cleanParUdf(exploded.col("text"))))
+  private def hashParagraphs(raw: DataFrame) = {
+    val exploded = splitTextToParagraphs(raw)
+    exploded.withColumn("parHash", xxhash64(cleanParagraphUdf(exploded.col("text"))))
   }
 
   def collectDocParts(
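For readers following along, here is a minimal standalone sketch of the split-then-hash pattern that the refactored helpers splitTextToParagraphs and hashParagraphs now share, runnable in a local Spark session. Only the "text" column name, the "\n\n" delimiter, and the posexplode/xxhash64 calls come from the diff above; the object name and toy data are assumptions for illustration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{posexplode, split, xxhash64}

// Hypothetical driver; not part of uzushio.
object ParagraphSplitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("paragraph-split-sketch")
      .getOrCreate()
    import spark.implicits._

    // Two toy documents; paragraphs are separated by blank lines ("\n\n"),
    // the same delimiter splitTextToParagraphs passes to split().
    val docs = Seq(
      "first paragraph\n\nsecond paragraph",
      "single paragraph"
    ).toDF("text")

    // posexplode turns the paragraph array into one row per paragraph while
    // keeping its position in the document, matching the explode step above.
    val exploded = docs.select(posexplode(split($"text", "\n\n")).as(Seq("pos", "text")))

    // hashParagraphs then derives a 64-bit hash per paragraph; here we hash
    // the raw text, since extractCleanParagraph is internal to the library.
    exploded.withColumn("parHash", xxhash64($"text")).show(truncate = false)

    spark.stop()
  }
}

Factoring the explode step into one helper means the dedup-preparation path and the hashing path can no longer drift apart in how they define a "paragraph", which is the point of the commit.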
LargeFreqParagraphsSpec.scala:

@@ -6,6 +6,25 @@ import org.scalatest.freespec.AnyFreeSpec
 class LargeFreqParagraphsSpec extends AnyFreeSpec {
   "LargeFreqParagraphs" - {
     val filter = new LargeFreqParagraphs(freq = 10)
+
+    "deletes all paragraphs" in {
+      val doc = Document(
+        Paragraph("p", "test1", nearFreq = 15),
+        Paragraph("p", "test2", nearFreq = 30),
+        Paragraph("p", "test3", nearFreq = 30),
+      )
+      val filtered = filter.checkDocument(doc)
+      assert(filtered.paragraphs(0).remove eq filter)
+      assert(filtered.paragraphs(1).remove eq filter)
+      assert(filtered.paragraphs(2).remove eq filter)
+
+      assert(filtered.countDroppedParagraphs() == 3)
+      val docs = filtered.splitByFilteredParagraphs()
+      assert(docs.length == 1)
+      val firstDoc = docs.head
+      assert(firstDoc.remove eq filter)
+    }
+
     "works with prefix (3)" in {
       val doc = Document(
         Paragraph("p", "test1", nearFreq = 15),
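The new test case pins down what happens when every paragraph crosses the frequency threshold: each paragraph's remove field is tagged with the filter instance itself, and after splitting, the lone resulting document carries the same mark. Below is a minimal self-contained sketch of that tagging scheme; the Par, Doc, and FreqThresholdFilter names are stand-ins rather than the uzushio API, and the >= comparison is an assumption (the diff alone does not show whether it is strict).

// Illustrative stand-ins for the library's Paragraph/Document types.
final case class Par(path: String, text: String, nearFreq: Int, remove: AnyRef = null)
final case class Doc(paragraphs: IndexedSeq[Par], remove: AnyRef = null) {
  def countDroppedParagraphs: Int = paragraphs.count(_.remove != null)
}

// Tags paragraphs whose near-duplicate frequency reaches the threshold.
class FreqThresholdFilter(freq: Int) {
  def checkDocument(doc: Doc): Doc = {
    val marked = doc.paragraphs.map { p =>
      // Assumed comparison: a paragraph seen at least `freq` times is dropped.
      if (p.nearFreq >= freq) p.copy(remove = this) else p
    }
    // When nothing survives, mark the whole document with the filter too,
    // mirroring `assert(firstDoc.remove eq filter)` in the test above.
    val docMark = if (marked.nonEmpty && marked.forall(_.remove != null)) this else doc.remove
    doc.copy(paragraphs = marked, remove = docMark)
  }
}

// Reproduces the assertions of the "deletes all paragraphs" case in spirit,
// without the splitByFilteredParagraphs step.
object FreqFilterDemo extends App {
  val filter = new FreqThresholdFilter(freq = 10)
  val doc = Doc(Vector(
    Par("p", "test1", nearFreq = 15),
    Par("p", "test2", nearFreq = 30),
    Par("p", "test3", nearFreq = 30),
  ))
  val filtered = filter.checkDocument(doc)
  assert(filtered.paragraphs.forall(_.remove eq filter))
  assert(filtered.countDroppedParagraphs == 3)
  assert(filtered.remove eq filter)
}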
