forked from YPARK/gtex-fqtl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil_subset_sim_data.sh
executable file
·65 lines (43 loc) · 1.22 KB
/
util_subset_sim_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash -l
# Some methods cannot handle missing values,
# so we simply take a subset of the full simulated data.
# Usage: util_subset_sim_data.sh datahdr n rseed
#   datahdr  path prefix of the input data set (<datahdr>.fam etc.)
#   n        number of individuals to draw
#   rseed    seed handed to get_seeded_random
#
# Sets the globals datahdr, n and rseed for the rest of the script.
# Exits 1 when fewer than three arguments are given or n is not an integer.
if [ $# -lt 3 ]; then
  # Usage text is a diagnostic for a failed invocation, so it goes to stderr.
  cat >&2 <<EOF
datahdr=\$1
n=\$2
rseed=\$3
EOF
  exit 1
fi

printf "[%s] take subsets\n\n" "$(date)"

datahdr=$1
n=$2
rseed=$3

# Echo the effective parameters into the log.
cat <<EOF
datahdr=$1
n=$2
rseed=$3
EOF

# Fail fast with a message instead of a silent exit deep in the pipeline:
# n is later used in numeric comparisons and as the shuf sample size.
case "$n" in
  '' | *[!0-9]*)
    printf 'error: n must be a non-negative integer, got "%s"\n' "$n" >&2
    exit 1
    ;;
esac
#######################################
# Emit a deterministic byte stream derived from a seed, suitable as a
# --random-source for GNU tools such as shuf.
# Arguments: $1 - seed string, used as the openssl passphrase
# Outputs:   the AES-256-CTR encryption of /dev/zero on stdout. The stream
#            does not end on its own; the consumer decides when to stop
#            reading.
#######################################
get_seeded_random() {
  # 'local' keeps the seed from clobbering a caller's variable of the same
  # name when the function is invoked in the current shell.
  local seed="$1"
  # The command line is kept exactly as before so that a given seed keeps
  # producing the same bytes (and therefore the same random subsets).
  # NOTE(review): stderr is discarded, presumably to hide openssl's warnings
  # and the write error raised once the reader closes the pipe — confirm.
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}
# ---------------------------------------------------------------------------
# Draw a reproducible random subset of n individuals.
#
# Reads the globals datahdr, n and rseed, plus the files
#   ${datahdr}.fam            one line per individual
#   ${datahdr}.yfull.txt.gz   gzip-compressed text, subset by row number
# Writes
#   ${datahdr}.random.${n}.ind       "<row number> <original .fam line>"
#   ${datahdr}.random.${n}.fam       the selected .fam lines (plink keep list)
#   ${datahdr}.random.${n}.*         whatever plink --make-bed emits for --out
#   ${datahdr}.random.${n}.y.txt.gz  the selected rows of yfull
# Exits 1 (with a message on stderr) as soon as any step fails.
# ---------------------------------------------------------------------------

# Print a message to stderr and abort with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

fam_file="${datahdr}.fam"
out_prefix="${datahdr}.random.${n}"
ind_file="${out_prefix}.ind"
y_out="${out_prefix}.y.txt.gz"

[ -f "$fam_file" ] || die "missing file: $fam_file"

ntot=$(wc -l < "$fam_file")
[ "$ntot" -ge "$n" ] \
  || die "n must be an integer <= $ntot (lines in $fam_file), got '$n'"

# Prefix every .fam line with its row number, sample n of them using the
# seeded random source, then restore the original row order.
awk '{ print NR FS $0 }' "$fam_file" \
  | shuf -n "$n" --random-source=<(get_seeded_random "$rseed") \
  | sort -k1n > "$ind_file"

# shuf yields nothing when the random source is unusable; catch that here
# rather than handing plink an empty keep list.
[ "$(wc -l < "$ind_file")" -eq "$n" ] \
  || die "expected $n sampled rows in $ind_file"

# Drop the row-number column again to recover plain .fam lines.
# NOTE(review): plink is run with --out set to this same prefix, so its own
# .fam output presumably replaces this keep list afterwards — confirm intended.
cut -d' ' -f 2- "$ind_file" > "${out_prefix}.fam"

# NOTE(review): --keep-fam appears to filter on the family-ID column only; if
# family IDs are not unique per individual, more than n samples could be
# kept — confirm against the plink documentation or switch to --keep.
./bin/plink --bfile "$datahdr" --make-bed \
  --keep-fam "${out_prefix}.fam" \
  --out "$out_prefix" \
  || die "plink failed for $out_prefix"

# Comma-separated list of the selected row numbers, passed to the awk helper
# as ROWS. An explicit "%s" keeps the data out of awk's format string.
rows=$(awk '{ if (NR > 1) printf ","; printf "%s", $1 }' "$ind_file")

# util_subset_rows.awk is expected to emit only the rows listed in ROWS
# (the line-count check below relies on that).
zcat "${datahdr}.yfull.txt.gz" \
  | awk -v ROWS="$rows" -f util_subset_rows.awk \
  | gzip > "$y_out"

[ -f "$y_out" ] || die "failed to create $y_out"
[ "$(zcat "$y_out" | wc -l)" -eq "$n" ] \
  || die "$y_out does not contain exactly $n rows"

printf "[%s] subsets taken n = %d.\n\n" "$(date)" "$n"