-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of ssh://github.com/WorksApplications/uzushio int…
…o large-freq-tag-paragpraph-filter
- Loading branch information
Showing
11 changed files
with
385 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
102 changes: 102 additions & 0 deletions
102
lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMDocAvgPerplexity.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
package com.worksap.nlp.uzushio.lib.filters | ||
|
||
import com.github.jbaiter.kenlm.BufferEvaluator | ||
import com.worksap.nlp.sudachi.{Dictionary, Morpheme} | ||
import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} | ||
import com.worksap.nlp.uzushio.lib.filters.base.{DocFilter, HighLowDocFilter} | ||
import com.worksap.nlp.uzushio.lib.resources.{KenLM, Sudachi} | ||
import com.worksap.nlp.uzushio.lib.utils.Paragraphs | ||
|
||
/**
  * Document filter based on a document-level, perplexity-like metric.
  *
  * The metric is the character-weighted average of `10^(-score)` over the alive paragraphs of a
  * document, where `score` is the value returned by `KenLMEvaluator.scoreParagraph`.
  * NOTE(review): this presumably is a log10-scale language model score -- confirm against the
  * KenLM binding. The keep/remove decision itself is delegated to `maybeFilter`.
  *
  * @param sudachi  Sudachi dictionary reference, passed to `KenLMEvaluator.make`
  * @param kenlm    KenLM model reference, passed to `KenLMEvaluator.make`
  * @param outliers outlier ratio, passed to `KenLMEvaluator.make`
  * @param high     upper bound used by the high/low filter logic
  * @param low      lower bound used by the high/low filter logic
  */
class KenLMDocAvgPerplexity(
    sudachi: String,
    kenlm: String,
    outliers: Float = 0,
    override val high: Float = 1e6f,
    override val low: Float = 0f
) extends HighLowDocFilter {

  // Built on first use and not serialized together with the filter instance.
  @transient
  private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)

  /** Measures the document and hands the metric over to `maybeFilter`. */
  override def checkDocument(doc: Document): Document = maybeFilter(doc, measureDoc(doc))

  /**
    * Computes the character-weighted average of per-paragraph `10^(-score)` values.
    *
    * Only paragraphs returned by `doc.aliveParagraphs` are considered. If they contain no
    * characters at all, the final division is `0.0 / 0` and the result is NaN.
    *
    * @param doc document to measure
    * @return the weighted average as a Float
    */
  def measureDoc(doc: Document): Float = {
    var weightedTotal = 0.0
    var totalChars = 0
    val alive = doc.aliveParagraphs
    val scorer = processor
    while (alive.hasNext) {
      val par = alive.next()
      val perplexity = Math.pow(10, -scorer.scoreParagraph(par))
      val nchars = par.text.length
      weightedTotal += perplexity * nchars
      totalChars += nchars
    }
    (weightedTotal / totalChars).toFloat
  }

  /** Short label of this filter which includes the configured outlier ratio. */
  override def describeFilter: String = s"KenLMAvgDoc($outliers)"
}
|
||
/**
  * Scores paragraphs with a KenLM model after tokenizing them with Sudachi.
  *
  * A single tokenizer and a single buffer evaluator are created per instance and reused for
  * every paragraph.
  * NOTE(review): because of that shared mutable state an instance is presumably not safe to
  * use from several threads at once -- confirm.
  *
  * @param sudachi Sudachi dictionary reference, resolved through `Sudachi.get`
  * @param kenlm   KenLM model reference, resolved through `KenLM.get`
  */
class KenLMEvaluator(sudachi: String, kenlm: String) {
  private val dictionary: Dictionary = Sudachi.get(sudachi)
  final protected val tokenizer = dictionary.create()
  // NOTE(review): the meaning of the two sizing arguments is defined by the KenLM binding.
  final protected val evaluator = KenLM.get(kenlm).bufferEvaluator(64 * 1024, 1024)

  /**
    * Tokenizes the paragraph text and appends the surfaces of accepted tokens to the
    * (cleared) shared evaluator buffer.
    *
    * Appending stops early once `append` reports a non-positive remaining value.
    *
    * @param p paragraph to process
    * @return the shared evaluator, filled with the tokens of this paragraph
    */
  def processParagraph(p: Paragraph): BufferEvaluator = {
    val morphemes = tokenizer.tokenize(p.text).iterator()
    val buffer = evaluator
    buffer.clear()
    var hasRoom = true
    while (morphemes.hasNext && hasRoom) {
      val morpheme = morphemes.next()
      if (acceptedToken(morpheme)) {
        hasRoom = buffer.append(morpheme.surface()) > 0
      }
    }
    buffer
  }

  /**
    * Decides whether a token is passed to the language model.
    *
    * Rejected are tokens whose normalized form is a single space and single-character tokens
    * which are a link start marker, a link end marker or a line feed.
    *
    * @param x token to check
    * @return true if the token should be appended to the evaluator
    */
  def acceptedToken(x: Morpheme): Boolean = {
    if (x.normalizedForm() == " ") {
      false
    } else {
      val surface = x.surface()
      if (surface.length != 1) {
        true
      } else {
        surface.charAt(0) match {
          case Paragraphs.HTML_LINK_START | Paragraphs.HTML_LINK_END | '\n' => false
          case _ => true
        }
      }
    }
  }

  /** Extracts the score from a filled evaluator; subclasses can override the strategy. */
  def extractScore(ev: BufferEvaluator): Double = ev.evaluate()

  /** Convenience method: fills the evaluator from the paragraph and returns its score. */
  def scoreParagraph(p: Paragraph): Double = extractScore(processParagraph(p))
}
|
||
/** Factory for [[KenLMEvaluator]] instances. */
object KenLMEvaluator {

  /**
    * Creates an evaluator for the given resources.
    *
    * @param sudachi Sudachi dictionary reference
    * @param kenlm   KenLM model reference
    * @param ratio   outlier ratio; values below 1e-3 select the plain evaluator,
    *                anything else selects [[KenLMEvaluatorNoOutliers]]
    * @return the evaluator matching the ratio
    */
  def make(sudachi: String, kenlm: String, ratio: Float): KenLMEvaluator = {
    val outliersDisabled = ratio < 1e-3
    if (outliersDisabled) new KenLMEvaluator(sudachi, kenlm)
    else new KenLMEvaluatorNoOutliers(sudachi, kenlm, ratio)
  }
}
|
||
/**
  * Evaluator variant which extracts the score with `evaluateNoOutliers`.
  *
  * @param sudachi Sudachi dictionary reference
  * @param kenlm   KenLM model reference
  * @param ratio   value passed to `evaluateNoOutliers` on every score extraction
  */
class KenLMEvaluatorNoOutliers(sudachi: String, kenlm: String, ratio: Float)
    extends KenLMEvaluator(sudachi, kenlm) {

  /** Delegates to the outlier-aware evaluation of the buffer. */
  override def extractScore(ev: BufferEvaluator): Double = {
    ev.evaluateNoOutliers(ratio)
  }
}
106 changes: 106 additions & 0 deletions
106
lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMParagraphPerplexity.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package com.worksap.nlp.uzushio.lib.filters | ||
|
||
import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} | ||
import com.worksap.nlp.uzushio.lib.filters.base.DocFilter | ||
|
||
import scala.collection.mutable | ||
|
||
/**
  * A paragraph paired with a score.
  *
  * @param p   the wrapped paragraph
  * @param ppx score attached to the paragraph; `KenLMParagraphPerplexity.checkDocument`
  *            stores the value of `scoreParagraph` here
  */
final case class ParagraphWithPerplexity(p: Paragraph, ppx: Float) {

  /** Whether the wrapped paragraph is still alive. */
  def isAlive: Boolean = p.isAlive

  /**
    * Returns a copy whose paragraph has `x` set as its `remove` marker; the score is kept.
    *
    * @param x marker object stored in the paragraph
    */
  def remove(x: AnyRef): ParagraphWithPerplexity = {
    val marked = p.copy(remove = x)
    copy(p = marked)
  }
}
|
||
/**
  * Marks runs of paragraphs whose language model score does not exceed a limit.
  *
  * The limit is `-log10(threshold)`, it is compared with the raw value of
  * `KenLMEvaluator.scoreParagraph`.
  * NOTE(review): this presumably means "perplexity of at least `threshold`" -- confirm the
  * scale of the score against the KenLM binding.
  *
  * @param sudachi   Sudachi dictionary reference, passed to `KenLMEvaluator.make`
  * @param kenlm     KenLM model reference, passed to `KenLMEvaluator.make`
  * @param outliers  outlier ratio, passed to `KenLMEvaluator.make`
  * @param count     size of the paragraph window which is inspected around a paragraph
  * @param threshold value from which the score limit is derived
  */
class KenLMParagraphPerplexity(
    sudachi: String,
    kenlm: String,
    outliers: Float = 0.02f,
    count: Int = 3,
    threshold: Float = 1e6f
) extends DocFilter {
  // Scores which are not greater than this limit count as suspicious.
  private val lmScore: Float = -Math.log10(threshold).toFloat

  // Built on first use and not serialized together with the filter instance.
  @transient
  private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)

  /**
    * Scores all paragraphs of the document and marks the ones selected by `markParagraphs`.
    *
    * @param doc document to check
    * @return the same document if nothing was marked, otherwise a copy with updated paragraphs
    */
  override def checkDocument(doc: Document): Document = {
    val scorer = processor
    val scored = doc.paragraphs
      .map(par => ParagraphWithPerplexity(par, scorer.scoreParagraph(par).toFloat)).toBuffer

    val changed = markParagraphs(scored)

    if (changed <= 0) {
      doc
    } else {
      doc.copy(paragraphs = scored.map(_.p))
    }
  }

  /**
    * Walks over the buffer front to back and marks (in place) every alive paragraph for which
    * the backward or the forward window check succeeds. For every such paragraph the preceding
    * paragraphs are additionally handled by `removePrev`.
    *
    * @param paragraphs scored paragraphs, updated in place
    * @return number of paragraphs which were marked
    */
  def markParagraphs(paragraphs: mutable.Buffer[ParagraphWithPerplexity]): Int = {
    val total = paragraphs.length
    var changed = 0
    for (pos <- 0 until total) {
      val current = paragraphs(pos)
      if (
        current.isAlive &&
        (shouldRemoveBack(paragraphs, pos) || shouldRemoveFwd(paragraphs, pos, total))
      ) {
        paragraphs(pos) = current.remove(this)
        changed += removePrev(paragraphs, pos)
        changed += 1
      }
    }
    changed
  }

  /**
    * Marks alive paragraphs with a score not greater than the limit among at most `count`
    * paragraphs directly before `offset`.
    *
    * @param paragraphs scored paragraphs, updated in place
    * @param offset     index of the paragraph which triggered the removal
    * @return number of paragraphs marked by this call
    */
  def removePrev(paragraphs: mutable.Buffer[ParagraphWithPerplexity], offset: Int): Int = {
    val first = math.max(offset - count, 0)

    @scala.annotation.tailrec
    def sweep(pos: Int, marked: Int): Int = {
      if (pos < first) {
        marked
      } else {
        val candidate = paragraphs(pos)
        if (candidate.isAlive && candidate.ppx <= lmScore) {
          paragraphs(pos) = candidate.remove(this)
          sweep(pos - 1, marked + 1)
        } else {
          sweep(pos - 1, marked)
        }
      }
    }

    sweep(offset - 1, 0)
  }

  /**
    * Checks the window which ends at `offset` and spans at most `count` paragraphs backwards
    * (it is cut at the start of the buffer).
    *
    * @return false as soon as one paragraph of the window has a score greater than the limit,
    *         true otherwise
    */
  def shouldRemoveBack(
      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
      offset: Int
  ): Boolean = {
    val first = math.max(offset - count + 1, 0)

    @scala.annotation.tailrec
    def noneAbove(pos: Int): Boolean = {
      if (pos < first) true
      else if (paragraphs(pos).ppx > lmScore) false
      else noneAbove(pos - 1)
    }

    noneAbove(offset)
  }

  /**
    * Checks the window which starts at `offset` and spans at most `count` paragraphs forwards
    * (it is cut at `length`).
    *
    * @return false as soon as one paragraph of the window has a score greater than the limit,
    *         true otherwise
    */
  def shouldRemoveFwd(
      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
      offset: Int,
      length: Int
  ): Boolean = {
    val limit = math.min(offset + count, length)

    @scala.annotation.tailrec
    def noneAbove(pos: Int): Boolean = {
      if (pos >= limit) true
      else if (paragraphs(pos).ppx > lmScore) false
      else noneAbove(pos + 1)
    }

    noneAbove(offset)
  }

  /** Label of the filter with its configuration, computed once at construction. */
  override val toString: String = s"KenLMPar($outliers,$count,$threshold)"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 47 additions & 0 deletions
47
lib/src/main/scala/com/worksap/nlp/uzushio/lib/resources/CachedLocalResource.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package com.worksap.nlp.uzushio.lib.resources | ||
|
||
import com.github.jbaiter.kenlm.Model | ||
import com.worksap.nlp.sudachi.{Config, Dictionary, DictionaryFactory} | ||
import org.apache.spark.SparkFiles | ||
|
||
import java.nio.file.{Files, Path, Paths} | ||
import java.util.concurrent.ConcurrentHashMap | ||
|
||
/**
  * Cache of resources which are loaded from files, keyed by the resolved file path.
  *
  * A resource reference is resolved first as a regular local path and then as a file
  * distributed by Spark (via `SparkFiles`).
  *
  * @tparam T type of the loaded resource
  */
trait CachedLocalResource[T] {
  final private val cache = new ConcurrentHashMap[Path, T]()

  /**
    * Loads the resource from the given file.
    *
    * @param p path of an existing regular file
    */
  def create(p: Path): T

  /**
    * Returns the resource for the reference, loading it with `create` on the first request
    * for the resolved path.
    *
    * @param dict local path or name of a Spark-distributed file
    * @return the cached or newly created resource
    * @throws IllegalArgumentException if the reference can not be resolved to an existing file
    */
  def get(dict: String): T = {
    val p = resolveLocalPath(dict).orElse(resolveSparkPath(dict)).getOrElse(
      throw new IllegalArgumentException(s"could not find file: $dict")
    )

    cache.computeIfAbsent(
      p,
      p1 => create(p1)
    )
  }

  /**
    * Interprets the string as a file system path.
    *
    * @return the path if it points to an existing regular file, None otherwise
    */
  def resolveLocalPath(str: String): Option[Path] = {
    val p = Paths.get(str)
    if (Files.exists(p) && Files.isRegularFile(p)) {
      Some(p)
    } else None
  }

  /**
    * Interprets the string as the name of a file distributed with Spark.
    *
    * `SparkFiles.get` needs an initialized Spark environment and fails (e.g. with a
    * NullPointerException) without one. Such a failure is treated as "not resolvable",
    * so that `get` reports the missing file with its own IllegalArgumentException instead.
    *
    * @return the path if Spark knows the file and it exists locally, None otherwise
    */
  def resolveSparkPath(str: String): Option[Path] = {
    scala.util.Try(SparkFiles.get(str)).toOption.flatMap(resolveLocalPath)
  }
}
|
||
/** Cache of Sudachi dictionaries, keyed by the path of the system dictionary file. */
object Sudachi extends CachedLocalResource[Dictionary] {

  /**
    * Creates a dictionary from the default Sudachi configuration with the given file set as
    * the system dictionary.
    *
    * @param p path of the system dictionary file
    */
  override def create(p: Path): Dictionary = {
    val settings = Config.defaultConfig().systemDictionary(p)
    val factory = new DictionaryFactory()
    factory.create(settings)
  }
}
|
||
/** Cache of KenLM models, keyed by the path of the model file. */
object KenLM extends CachedLocalResource[Model] {

  /**
    * Creates a model from the given file.
    *
    * @param p path of the model file
    */
  override def create(p: Path): Model = {
    new Model(p)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.