diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala index 52896bf..5346df2 100644 --- a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala +++ b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala @@ -820,7 +820,7 @@ object DeduplicateParagraphs { val basicCols = (if (debug) { joined.columns.filter { - case "parHash" => false + case "parHash" => true case "exactFreq" | "nearFreq" => false case _ => true } @@ -901,7 +901,7 @@ object DeduplicateParagraphs { case class ProcessedDocument(text: String, filter: String) - private def processDocumentParts( + private def processDocumentParts( args: Args, docId: String, parts: IndexedSeq[Paragraph] diff --git a/scripts/submit_all_compute_stats.sh b/scripts/submit_all_compute_stats.sh new file mode 100644 index 0000000..1a77b0d --- /dev/null +++ b/scripts/submit_all_compute_stats.sh @@ -0,0 +1,70 @@ +submit_post2017() { + qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \ + "/groups/gcf51199/cc/extracted/segment\=$1" \ + /groups/gcf51199/cc/stats_raw_v2/segment=$1 \ + 500 4000 +} + +submit_post2017 CC-MAIN-2017-04 +submit_post2017 CC-MAIN-2017-09 +submit_post2017 CC-MAIN-2017-13 +submit_post2017 CC-MAIN-2017-17 +submit_post2017 CC-MAIN-2017-22 +submit_post2017 CC-MAIN-2017-26 +submit_post2017 CC-MAIN-2017-30 +submit_post2017 CC-MAIN-2017-34 +submit_post2017 CC-MAIN-2017-39 +submit_post2017 CC-MAIN-2017-43 +submit_post2017 CC-MAIN-2017-47 +submit_post2017 CC-MAIN-2017-51 +submit_post2017 CC-MAIN-2018-05 +submit_post2017 CC-MAIN-2018-09 +submit_post2017 CC-MAIN-2018-13 +submit_post2017 CC-MAIN-2018-17 +submit_post2017 CC-MAIN-2018-22 +submit_post2017 CC-MAIN-2018-26 +submit_post2017 CC-MAIN-2018-30 +submit_post2017 CC-MAIN-2018-34 +submit_post2017 CC-MAIN-2018-39 +submit_post2017 CC-MAIN-2018-43 +submit_post2017 CC-MAIN-2018-47 +submit_post2017 CC-MAIN-2018-51 +submit_post2017 CC-MAIN-2019-04 +submit_post2017 CC-MAIN-2019-09 +submit_post2017 CC-MAIN-2019-13 +submit_post2017 CC-MAIN-2019-18 +submit_post2017 CC-MAIN-2019-22 +submit_post2017 CC-MAIN-2019-26 +submit_post2017 CC-MAIN-2019-30 +submit_post2017 CC-MAIN-2019-35 +submit_post2017 CC-MAIN-2019-39 +submit_post2017 CC-MAIN-2019-43 +submit_post2017 CC-MAIN-2019-47 +submit_post2017 CC-MAIN-2019-51 +submit_post2017 CC-MAIN-2020-05 +submit_post2017 CC-MAIN-2020-10 +submit_post2017 CC-MAIN-2020-16 +submit_post2017 CC-MAIN-2020-24 +submit_post2017 CC-MAIN-2020-29 +submit_post2017 CC-MAIN-2020-34 +submit_post2017 CC-MAIN-2020-40 +submit_post2017 CC-MAIN-2020-45 +submit_post2017 CC-MAIN-2020-50 +submit_post2017 CC-MAIN-2021-04 +submit_post2017 CC-MAIN-2021-10 +submit_post2017 CC-MAIN-2021-17 +submit_post2017 CC-MAIN-2021-21 +submit_post2017 CC-MAIN-2021-25 +submit_post2017 CC-MAIN-2021-31 +submit_post2017 CC-MAIN-2021-39 +submit_post2017 CC-MAIN-2021-43 +submit_post2017 CC-MAIN-2021-49 +submit_post2017 CC-MAIN-2022-05 +submit_post2017 CC-MAIN-2022-21 +submit_post2017 CC-MAIN-2022-27 +submit_post2017 CC-MAIN-2022-33 +submit_post2017 CC-MAIN-2022-40 +submit_post2017 CC-MAIN-2022-49 +submit_post2017 CC-MAIN-2023-06 +submit_post2017 CC-MAIN-2023-14 +submit_post2017 CC-MAIN-2023-23 \ No newline at end of file diff --git a/scripts/submit_all_compute_stats_old.sh b/scripts/submit_all_compute_stats_old.sh new file mode 100644 index 0000000..14386f6 --- /dev/null +++ b/scripts/submit_all_compute_stats_old.sh @@ -0,0 +1,11 @@ +submit_pre2016() { + qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \ + "/groups/gcf51199/cc2/extracted/$1" \ + "/groups/gcf51199/cc/stats_raw_v2/segment=$1" \ + 500 4000 +} + +submit_pre2016 merged-2013 +submit_pre2016 merged-2014 +submit_pre2016 merged-2015 +submit_pre2016 merged-2016 \ No newline at end of file diff --git a/scripts/submit_all_filter.sh b/scripts/submit_all_filter.sh index 5f6a246..288ccfa 100644 --- a/scripts/submit_all_filter.sh +++ b/scripts/submit_all_filter.sh @@ -4,7 +4,7 @@ submit() { qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \ "/groups/gcf51199/cc/extracted/segment\=$1" \ /groups/gcf51199/cc/stats_merged_v1/for_filter/all \ - "/groups/gcf51199/cc/filtered_v1/segment=$1" + "/groups/gcf51199/cc/filtered_v2/segment=$1" } submit CC-MAIN-2017-04 @@ -23,7 +23,7 @@ submit CC-MAIN-2018-05 submit CC-MAIN-2018-09 submit CC-MAIN-2018-13 submit CC-MAIN-2018-17 -# submit CC-MAIN-2018-22 +submit CC-MAIN-2018-22 submit CC-MAIN-2018-26 submit CC-MAIN-2018-30 submit CC-MAIN-2018-34 diff --git a/scripts/submit_dedup_stage1.sh b/scripts/submit_dedup_stage1.sh index 0f9eafb..20794de 100644 --- a/scripts/submit_dedup_stage1.sh +++ b/scripts/submit_dedup_stage1.sh @@ -3,6 +3,7 @@ #$ -j y #$ -cwd #$ -l USE_SSH=1 +#$ -l USE_EXTRA_NETWORK=1 du -hs "$1" > /dev/null & @@ -38,6 +39,7 @@ NUM_PARTITIONS_PROPAGATION=${4:-$(($NUM_PARTITIONS * 4))} --conf spark.eventLog.dir=/scratch/$USER/spark-exlog \ --conf spark.local.dir=$SPARK_LOCAL_DIRS \ --conf spark.sql.shuffle.partitions=${NUM_PARTITIONS_PROPAGATION} \ + --conf spark.sql.parquet.columnarReaderBatchSize=512 \ local://$UZUSHIO_JAR \ --input="$INPUT" \ --output="$OUTPUT" \ diff --git a/scripts/submit_filter_debug_2.sh b/scripts/submit_filter_debug_2.sh index 04bdb69..44b9da8 100644 --- a/scripts/submit_filter_debug_2.sh +++ b/scripts/submit_filter_debug_2.sh @@ -50,7 +50,7 @@ NUM_PARTITIONS_PROPAGATION=4000 --filters=$SCRIPT_DIR/pipeline_02.conf \ --partitions=$NUM_PARTITIONS \ --execution=filter-debug \ - -Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_model_sudachi_filter.bin \ + -Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_merge-code_0.05_model.bin \ -Psudachi=/groups/gcf51199/resources/sudachi-dictionary-20230927/system_core.dic \ --format=json --compression=gzip --text-only