Skip to content

Commit

Permalink
update scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito committed Dec 22, 2023
1 parent 3730a32 commit 3bf2349
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ object DeduplicateParagraphs {

val basicCols = (if (debug) {
joined.columns.filter {
case "parHash" => false
case "parHash" => true
case "exactFreq" | "nearFreq" => false
case _ => true
}
Expand Down Expand Up @@ -901,7 +901,7 @@ object DeduplicateParagraphs {

/**
  * Immutable pair of two strings describing one processed document.
  *
  * @param text   the document text
  * @param filter a filter-related string
  *               NOTE(review): presumably the name/tag of the filter that applied to
  *               the document — confirm against the code that constructs this class.
  */
case class ProcessedDocument(
    text: String,
    filter: String
)

private def processDocumentParts(
private def processDocumentParts(
args: Args,
docId: String,
parts: IndexedSeq[Paragraph]
Expand Down
70 changes: 70 additions & 0 deletions scripts/submit_all_compute_stats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Submit one submit_dedup_stage1.sh job through qsub for a single segment.
#
# Arguments:
#   $1 - segment name, e.g. CC-MAIN-2017-04
#
# The job script receives, in order: the input path, the output path, and the
# literals 500 and 4000.
# NOTE(review): 500/4000 are presumably the partition counts read by
# submit_dedup_stage1.sh — confirm against that script.
submit_post2017() {
    # Inside double quotes the backslash before '=' is NOT removed by the
    # shell, so the input path is passed with a literal "\=".
    # NOTE(review): presumably intentional (submit_all_filter.sh uses the same
    # form) — confirm before changing.
    #
    # Both paths are quoted so that a segment name containing whitespace or
    # glob characters is passed as exactly one argument each.
    qsub -g gcf51199 -l rt_F=10 -l h_rt=4:00:00 submit_dedup_stage1.sh \
        "/groups/gcf51199/cc/extracted/segment\=$1" \
        "/groups/gcf51199/cc/stats_raw_v2/segment=$1" \
        500 4000
}

# Submit one job per segment, in the order listed below.
for cc_segment in \
    CC-MAIN-2017-04 CC-MAIN-2017-09 CC-MAIN-2017-13 CC-MAIN-2017-17 \
    CC-MAIN-2017-22 CC-MAIN-2017-26 CC-MAIN-2017-30 CC-MAIN-2017-34 \
    CC-MAIN-2017-39 CC-MAIN-2017-43 CC-MAIN-2017-47 CC-MAIN-2017-51 \
    CC-MAIN-2018-05 CC-MAIN-2018-09 CC-MAIN-2018-13 CC-MAIN-2018-17 \
    CC-MAIN-2018-22 CC-MAIN-2018-26 CC-MAIN-2018-30 CC-MAIN-2018-34 \
    CC-MAIN-2018-39 CC-MAIN-2018-43 CC-MAIN-2018-47 CC-MAIN-2018-51 \
    CC-MAIN-2019-04 CC-MAIN-2019-09 CC-MAIN-2019-13 CC-MAIN-2019-18 \
    CC-MAIN-2019-22 CC-MAIN-2019-26 CC-MAIN-2019-30 CC-MAIN-2019-35 \
    CC-MAIN-2019-39 CC-MAIN-2019-43 CC-MAIN-2019-47 CC-MAIN-2019-51 \
    CC-MAIN-2020-05 CC-MAIN-2020-10 CC-MAIN-2020-16 CC-MAIN-2020-24 \
    CC-MAIN-2020-29 CC-MAIN-2020-34 CC-MAIN-2020-40 CC-MAIN-2020-45 \
    CC-MAIN-2020-50 \
    CC-MAIN-2021-04 CC-MAIN-2021-10 CC-MAIN-2021-17 CC-MAIN-2021-21 \
    CC-MAIN-2021-25 CC-MAIN-2021-31 CC-MAIN-2021-39 CC-MAIN-2021-43 \
    CC-MAIN-2021-49 \
    CC-MAIN-2022-05 CC-MAIN-2022-21 CC-MAIN-2022-27 CC-MAIN-2022-33 \
    CC-MAIN-2022-40 CC-MAIN-2022-49 \
    CC-MAIN-2023-06 CC-MAIN-2023-14 CC-MAIN-2023-23
do
    submit_post2017 "$cc_segment"
done
11 changes: 11 additions & 0 deletions scripts/submit_all_compute_stats_old.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Submit one submit_dedup_stage1.sh job through qsub for a single merged
# dataset.
#
# Arguments:
#   $1 - dataset name, e.g. merged-2013; used as the last component of the
#        input path and as the "segment=" value of the output path
submit_pre2016() {
    qsub \
        -g gcf51199 \
        -l rt_F=10 \
        -l h_rt=4:00:00 \
        submit_dedup_stage1.sh \
        "/groups/gcf51199/cc2/extracted/${1}" \
        "/groups/gcf51199/cc/stats_raw_v2/segment=${1}" \
        500 \
        4000
}

submit_pre2016 merged-2013
submit_pre2016 merged-2014
submit_pre2016 merged-2015
submit_pre2016 merged-2016
4 changes: 2 additions & 2 deletions scripts/submit_all_filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ submit() {
qsub -g gcf51199 -l rt_F=10 -l h_rt=1:00:00 submit_filter_debug_2.sh \
"/groups/gcf51199/cc/extracted/segment\=$1" \
/groups/gcf51199/cc/stats_merged_v1/for_filter/all \
"/groups/gcf51199/cc/filtered_v1/segment=$1"
"/groups/gcf51199/cc/filtered_v2/segment=$1"
}

submit CC-MAIN-2017-04
Expand All @@ -23,7 +23,7 @@ submit CC-MAIN-2018-05
submit CC-MAIN-2018-09
submit CC-MAIN-2018-13
submit CC-MAIN-2018-17
# submit CC-MAIN-2018-22
submit CC-MAIN-2018-22
submit CC-MAIN-2018-26
submit CC-MAIN-2018-30
submit CC-MAIN-2018-34
Expand Down
2 changes: 2 additions & 0 deletions scripts/submit_dedup_stage1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#$ -j y
#$ -cwd
#$ -l USE_SSH=1
#$ -l USE_EXTRA_NETWORK=1

du -hs "$1" > /dev/null &

Expand Down Expand Up @@ -38,6 +39,7 @@ NUM_PARTITIONS_PROPAGATION=${4:-$(($NUM_PARTITIONS * 4))}
--conf spark.eventLog.dir=/scratch/$USER/spark-exlog \
--conf spark.local.dir=$SPARK_LOCAL_DIRS \
--conf spark.sql.shuffle.partitions=${NUM_PARTITIONS_PROPAGATION} \
--conf spark.sql.parquet.columnarReaderBatchSize=512 \
local://$UZUSHIO_JAR \
--input="$INPUT" \
--output="$OUTPUT" \
Expand Down
2 changes: 1 addition & 1 deletion scripts/submit_filter_debug_2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ NUM_PARTITIONS_PROPAGATION=4000
--filters=$SCRIPT_DIR/pipeline_02.conf \
--partitions=$NUM_PARTITIONS \
--execution=filter-debug \
-Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_model_sudachi_filter.bin \
-Pkenlm=/groups/gcf51199/filter/n-gram_model/kenlm_merge-code_0.05_model.bin \
-Psudachi=/groups/gcf51199/resources/sudachi-dictionary-20230927/system_core.dic \
--format=json --compression=gzip --text-only

Expand Down

0 comments on commit 3bf2349

Please sign in to comment.