-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbeast2-xml.py
executable file
·215 lines (190 loc) · 6.65 KB
/
beast2-xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env python
from __future__ import print_function, division
import argparse
from itertools import chain
from dark.reads import addFASTACommandLineOptions, parseFASTACommandLineOptions
from beast2xml import BEAST2XML
# Closed sets of values accepted by the enumerated command-line options.
_CLOCK_MODELS = ("random-local", "relaxed-exponential", "relaxed-lognormal", "strict")
_DATE_UNITS = ("day", "month", "year")
_DATE_DIRECTIONS = ("backward", "forward")

# The command-line interface: FASTA in, BEAST2 XML out. Defaults are shown in
# --help output via ArgumentDefaultsHelpFormatter.
parser = argparse.ArgumentParser(
    description=(
        "Given FASTA on stdin (or in a file via the --fastaFile "
        "option), write an XML BEAST2 input file on stdout."
    ),
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# --clock_model and --template_file may not be combined, so they are
# registered in a mutually exclusive group.
group = parser.add_mutually_exclusive_group()

group.add_argument(
    "--clock_model",
    choices=_CLOCK_MODELS,
    default="strict",
    metavar="MODEL",
    help=(
        "Specify the clock model. Possible values are "
        "'random-local', 'relaxed-exponential', 'relaxed-lognormal', "
        "or 'strict'"
    ),
)

group.add_argument(
    "--template_file",
    metavar="FILENAME",
    help="The XML template file to use.",
)

parser.add_argument(
    "--chain_length",
    type=int,
    metavar="LENGTH",
    help="The MCMC chain length.",
)

# nargs="+" combined with action="append" means the parsed value is a list
# of lists (one inner list per occurrence of --age on the command line).
parser.add_argument(
    "--age",
    action="append",
    nargs="+",
    metavar="ID=N",
    help=(
        "The age of a sequence. The format is a sequence id, an equals "
        "sign, then the age. For convenience, just the first part "
        "of a full sequence id (i.e., up to the first space) may be given. "
        "May be specified multiple times."
    ),
)

parser.add_argument(
    "--default_age",
    type=float,
    default=0.0,
    metavar="N",
    help=(
        "The age to use for sequences that are not explicitly given an "
        "age via --age."
    ),
)

parser.add_argument(
    "--date_unit",
    choices=_DATE_UNITS,
    default="year",
    metavar="UNIT",
    help="Specify the date unit. Possible values are 'day', 'month', or 'year'.",
)

parser.add_argument(
    "--date_direction",
    choices=_DATE_DIRECTIONS,
    default="backward",
    metavar="DIRECTION",
    help=(
        "Specify whether dates are back in time from the present or "
        "forward in time from some point in the past. Possible values are "
        "'forward' or 'backward'."
    ),
)

parser.add_argument(
    "--log_file_basename",
    default="beast-output",
    metavar="BASE-FILENAME",
    help=(
        'The base filename to write logs to. A ".log" or ".trees" suffix '
        "will be appended to this to make complete log file names."
    ),
)

# The three logging-frequency options differ only in their name and help
# text, so register them from a small table (in the original order).
for _flag, _help in (
    ("--trace_log_every", "How often to write to the trace log file."),
    ("--tree_log_every", "How often to write to the tree log file."),
    (
        "--screen_log_every",
        "How often to write logging to the screen (i.e., terminal).",
    ),
):
    parser.add_argument(_flag, type=int, default=2000, metavar="N", help=_help)

parser.add_argument(
    "--mimic_beauti",
    action="store_true",
    help=(
        "If specified, add attributes to the <beast> tag that mimic what "
        "BEAUti uses so that BEAUti will be able to load the XML."
    ),
)

parser.add_argument(
    "--sequence_id_date_regex",
    metavar="REGEX",
    help=(
        "A regular expression that will be used to capture sequence dates "
        "from their ids. The regular expression must have three named "
        'capture regions ("year", "month", and "day"). Regular expression '
        "matching is anchored to the start of the id string (i.e., "
        "Python's re.match function is used, not the re.search function), "
        "so you must explicitly match the id from its beginning. For "
        "example, you might use --sequence_id_date_regex "
        r"'^.*_(?P<year>\d\d\d\d)-(?P<month>\d\d)-(?P<day>\d\d)'."
    ),
)

parser.add_argument(
    "--sequence_id_age_regex",
    metavar="REGEX",
    help=(
        "A regular expression that will be used to capture sequence ages "
        "from their ids. The regular expression must have a single "
        "capture region. Regular expression matching is anchored to the "
        "start of the id string (i.e., Python's re.match function is used, "
        "not the re.search function), so you must explicitly match the id "
        "from its beginning. For example, you might use "
        r"--sequence_id_age_regex '^.*_(\d+)$' to capture an age preceded by "
        "an underscore at the very end of the sequence id. If "
        "--sequence_id_date_regex is also given, it takes precedence when "
        "matching sequence ids."
    ),
)

# The second spelling, --sequence_id_date_regexMayNotMatch, is an older name
# that is still accepted for backwards compatibility. Giving either flag
# stores False in args.sequence_id_regex_must_match (which is otherwise True).
parser.add_argument(
    "--sequenceIdRegexMayNotMatch",
    "--sequence_id_date_regexMayNotMatch",
    dest="sequence_id_regex_must_match",
    action="store_false",
    help=(
        "If specified (and --sequence_id_date_regex or --sequence_id_age_regex is "
        "given) it will not be considered an error if a sequence id does "
        "not match the given regular expression. In that case, sequences "
        "will be assigned an age of zero unless one is given via --age."
    ),
)
# Let dark-matter register its FASTA input options on the parser, then parse
# the command line and obtain the input reads from those options.
addFASTACommandLineOptions(parser)
args = parser.parse_args()
reads = parseFASTACommandLineOptions(args)

# The template file (if any) and the clock model are both passed through;
# argparse has already ensured that only one was given explicitly.
xml = BEAST2XML(
    template=args.template_file,
    clock_model=args.clock_model,
    sequence_id_date_regex=args.sequence_id_date_regex,
    sequence_id_age_regex=args.sequence_id_age_regex,
    sequence_id_regex_must_match=args.sequence_id_regex_must_match,
    date_unit=args.date_unit,
)
xml.add_sequences(reads)

if args.age:
    # Flatten lists of lists that we get from using both nargs='+' and
    # action='append'. We use both because it allows people to use --age on the
    # command line either via "--age id1=33 --age id2=21" or "--age id1=33
    # id2=21", or a combination of these. That way it's not necessary to
    # remember which way you're supposed to use it and you also can't be hit by
    # the subtle problem encountered in
    # https://github.com/acorg/dark-matter/issues/453
    for age_info in chain.from_iterable(args.age):
        try:
            # Split on the LAST equals sign so that sequence ids that
            # themselves contain "=" are still accepted. Positional arguments
            # are used because str.rsplit does not accept keywords on Python 2.
            id_, age_text = age_info.rsplit("=", 1)
            age = float(age_text.strip())
        except ValueError:
            # Either there was no "=" to split on or the age was not numeric.
            # Report it as a usage error instead of a raw traceback.
            parser.error(
                "Invalid --age value %r: expected ID=N with a numeric age N."
                % (age_info,)
            )
        xml.add_age(id_.strip(), age)
# Render the XML and write it to stdout. The replace() call inserts a newline
# between adjacent <sequence> elements that would otherwise be emitted on the
# same line.
print(
    xml.to_string(
        chain_length=args.chain_length,
        default_age=args.default_age,
        date_direction=args.date_direction,
        log_file_basename=args.log_file_basename,
        trace_log_every=args.trace_log_every,
        # Pass the --tree_log_every value here. Previously the trace log
        # frequency was passed by mistake, so --tree_log_every had no effect.
        tree_log_every=args.tree_log_every,
        screen_log_every=args.screen_log_every,
        mimic_beauti=args.mimic_beauti,
    ).replace('" /><sequence', '" />\n <sequence')
)