Skip to content

Commit

Permalink
Deduplicate PAPI v2 input localizations. [BA-5979] [force ci] (broadi…
Browse files Browse the repository at this point in the history
  • Loading branch information
mcovarr authored Sep 9, 2019
1 parent d42bacb commit 4abcd1b
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: dedup_localizations_papi_v2
testFormat: workflowsuccess
backends: [Papiv2]

files {
workflow: dedup_localizations_papi_v2/dedup_localizations_papi_v2.wdl
}

metadata {
workflowName: dedup_localizations_papi_v2
status: Succeeded
"outputs.dedup_localizations_papi_v2.check_log.num_input_localizations": 1
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
version 1.0

workflow dedup_localizations_papi_v2 {
call producer
call consumer { input: first = producer.data, second = producer.data }
call check_log { input: out_file_path = consumer.out, log_file_name = "consumer.log" }
}

task producer {
command {
echo "Here is some data." > data.txt
}

runtime {
docker: "ubuntu:latest"
}

output {
File data = "data.txt"
}
}

task consumer {
input {
File first
File second
}

command {
# noop
}

runtime {
docker: "ubuntu:latest"
}

output {
File out = stdout()
}
}

task check_log {
input {
String out_file_path
String log_file_name
}
String file_log = sub(out_file_path, "/stdout$", "/" + log_file_name)
command {
set -euo pipefail
gsutil cp ~{file_log} log.txt
set +e
grep 'Localizing input gs://cloud-cromwell-dev-self-cleaning/cromwell_execution/travis/dedup_localizations_papi_v2/' log.txt | grep -c "data.txt"
}
output {
File out = stdout()
Int num_input_localizations = read_int(stdout())
}
runtime { docker: "google/cloud-sdk" }
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ class PipelinesApiAsyncBackendJobExecutionActor(standardParams: StandardAsyncExe
}

val filesByContainerParentDirectory = filesWithSameNames.groupBy(_.containerPath.parent.toString)
// Deduplicate any inputs since parallel localization can't deal with this.
val uniqueFilesByContainerParentDirectory = filesByContainerParentDirectory map { case (p, fs) => p -> fs.toSet }

val filesWithSameNamesTransferBundles: List[String] = filesByContainerParentDirectory.toList map { case (containerParent, filesWithSameParent) =>
val filesWithSameNamesTransferBundles: List[String] = uniqueFilesByContainerParentDirectory.toList map { case (containerParent, filesWithSameParent) =>
val arrayIdentifier = s"files_to_localize_" + DigestUtils.md5Hex(bucket + containerParent)
val entries = filesWithSameParent.map(_.cloudPath) mkString("\"", "\"\n| \"", "\"")

Expand Down

0 comments on commit 4abcd1b

Please sign in to comment.