diff --git a/Makefile b/Makefile index bf32771..4304724 100644 --- a/Makefile +++ b/Makefile @@ -25,8 +25,14 @@ endif all:$(PROG) -miniprot:$(OBJS) main.o - $(CC) $(CFLAGS) $^ -o $@ $(LIBS) +libminiprot.a:$(OBJS) + $(AR) -csru $@ $(OBJS) + +miniprot:$(OBJS) main.o libminiprot.a + $(CC) $(CFLAGS) main.o -o $@ -L. -lminiprot $(LIBS) + +miniprot-lite:example.o libminiprot.a + $(CC) $(CFLAGS) $< -o $@ -L. -lminiprot $(LIBS) clean: rm -fr *.o a.out $(PROG) *~ *.a *.dSYM @@ -39,6 +45,7 @@ depend: align.o: mppriv.h miniprot.h nasw.h kalloc.h kseq.h bseq.o: kvec-km.h kalloc.h mppriv.h miniprot.h nasw.h kseq.h chain.o: mppriv.h miniprot.h nasw.h kalloc.h kseq.h +example.o: miniprot.h nasw.h kalloc.h kseq.h format.o: kseq.h mppriv.h miniprot.h nasw.h kalloc.h hit.o: mppriv.h miniprot.h nasw.h kalloc.h kseq.h index.o: mppriv.h miniprot.h nasw.h kalloc.h kseq.h kvec-km.h kthread.h diff --git a/example.c b/example.c new file mode 100644 index 0000000..3ffe7a3 --- /dev/null +++ b/example.c @@ -0,0 +1,64 @@ +// To compile: +// gcc -g -O2 example.c libminiprot.a -lz + +#include +#include +#include +#include +#include "miniprot.h" +#include "nasw.h" // needed for CIGAR +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +int main(int argc, char *argv[]) +{ + mp_idxopt_t iopt; + mp_mapopt_t mopt; + int n_threads = 3; + + mp_start(); // this set the translation table to "1" + //ns_make_tables(1); // set the translation table if not "1" + mp_verbose = 2; // disable message output to stderr + mp_idxopt_init(&iopt); + mp_mapopt_init(&mopt); + + if (argc < 3) { + fprintf(stderr, "Usage: miniprot-lite \n"); + return 1; + } + + // open query file for reading; you may use your favorite FASTA/Q parser + gzFile f = gzopen(argv[2], "r"); + assert(f); + kseq_t *ks = kseq_init(f); + + // open index reader + mp_idx_t *mi = mp_idx_load(argv[1], &iopt, n_threads); + if (mi != 0) { + mp_tbuf_t *tbuf = mp_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread + while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence + mp_reg1_t *reg; + int j, i, n_reg; + reg = mp_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &mopt, 0); // get all hits for the query + for (j = 0; j < n_reg; ++j) { // traverse hits and print them out + mp_reg1_t *r = ®[j]; + assert(r->p); + const mp_ctg_t *ctg = &mi->nt->ctg[r->vid>>1]; + int64_t rs = r->vid&1? ctg->len - r->ve : r->vs; + int64_t re = r->vid&1? ctg->len - r->vs : r->ve; + printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, (int)ks->seq.l, r->qs, r->qe, "+-"[r->vid&1]); + printf("%s\t%ld\t%ld\t%ld\t%d\t%d\t0\tcg:Z:", ctg->name, (long)ctg->len, (long)rs, (long)re, r->p->n_iden * 3, r->p->blen); + for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings! + printf("%d%c", r->p->cigar[i]>>4, NS_CIGAR_STR[r->p->cigar[i]&0xf]); + putchar('\n'); + free(r->p); + } + free(reg); + } + mp_tbuf_destroy(tbuf); + mp_idx_destroy(mi); + } + kseq_destroy(ks); // close the query file + gzclose(f); + return 0; +} diff --git a/miniprot.h b/miniprot.h index 6013500..29e65d2 100644 --- a/miniprot.h +++ b/miniprot.h @@ -150,21 +150,137 @@ extern char *ns_tab_nt_i2c, *ns_tab_aa_i2c; extern uint8_t ns_tab_a2r[22], ns_tab_nt4[256], ns_tab_aa20[256], ns_tab_aa13[256]; extern uint8_t ns_tab_codon[64], ns_tab_codon13[64]; +/** + * Set the translation table and builtin timer + * + * Run this before all other miniprot routines. + */ void mp_start(void); +/** + * Initialize the default parameters for indexing + * + * @param io struct to initialize (out) + */ void mp_idxopt_init(mp_idxopt_t *io); + +/** + * Initialize the default parameters fr mapping + * + * @param mo struct to initialize (out) + */ void mp_mapopt_init(mp_mapopt_t *mo); + +/** + * Set frameshift and stop codon penalty + * + * This function changes the mp_mapopt_t::fs field and mp_mapopt_t::mat[] + * + * @param mo mapping options (in/out) + * @param fs frameshift and stop codon penalty (positive) + */ void mp_mapopt_set_fs(mp_mapopt_t *mo, int32_t fs); + +/** + * Set max intron length based on the genome size + * + * @param mo mapping options (in/out) + * @param gsize genome size + */ void mp_mapopt_set_max_intron(mp_mapopt_t *mo, int64_t gsize); + +/** + * Check the integrity of mapping parameters (not complete) + * + * @param mo mapping options + * + * @return 0 on success, or negative on failure + */ int32_t mp_mapopt_check(const mp_mapopt_t *mo); +/** + * Load or build an index + * + * If _fn_ corresponds to a FASTA file, build the index using _io_ and + * _n_threads_. If _fn_ corresponds to a prebuilt index file, load the index + * and ignore _io_ and _n_threads_. + * + * @param fn FASTA or index file name + * @param io indexing options + * @param n_threads number of threads + * + * @return the index + */ mp_idx_t *mp_idx_load(const char *fn, const mp_idxopt_t *io, int32_t n_threads); + +/** + * Deallocate the index + * + * @param mi the index + */ void mp_idx_destroy(mp_idx_t *mi); + +/** + * Write an index to disk + * + * @param fn index file name + * @param mi the index + * + * @return 0 on success or negative on failure + */ int mp_idx_dump(const char *fn, const mp_idx_t *mi); + +/** + * Read an index file + * + * @param fn index file name + * + * @return the index, or NULL on failure + */ mp_idx_t *mp_idx_restore(const char *fn); -void mp_idx_print_stat(const mp_idx_t *mi, int32_t max_occ); + +/** + * Load a splice score file + * + * @param nt sequences in mp_idx_t::nt (in/out) + * @param fn splice score file name; can be optionally gzip'd + * @param max_sc cap splice scores at this parameter + * + * @return 0 on success, or negative on failure + */ int32_t mp_ntseq_read_spsc(mp_ntdb_t *nt, const char *fn, int32_t max_sc); +/** + * Align a protein sequence against the index + * + * @param mi the index + * @param qlen protein length + * @param seq protein sequence + * @param n_reg number of alignments (out) + * @param b thread-local buffer; threads shouldn't share buffers (can be NULL) + * @param opt mapping parameters + * @param qname query name (used for breaking ties) + * + * @return list of alignments + */ +mp_reg1_t *mp_map(const mp_idx_t *mi, int qlen, const char *seq, int *n_reg, mp_tbuf_t *b, const mp_mapopt_t *opt, const char *qname); + +/** + * Initialize a thread-local buffer + * + * @return the buffer + */ +mp_tbuf_t *mp_tbuf_init(void); + +/** + * Deallocate a thread-local buffer + * + * @param b the buffer + */ +void mp_tbuf_destroy(mp_tbuf_t *b); + +// ignore the following for now +void mp_idx_print_stat(const mp_idx_t *mi, int32_t max_occ); int32_t mp_map_file(const mp_idx_t *idx, const char *fn, const mp_mapopt_t *opt, int n_threads); #ifdef __cplusplus