diff --git a/README.md b/README.md index 04c9e2a2..e44c41ae 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ imported from foreign languages and unified under a common type system. See [the manual](https://morloc-project.github.io/docs) for more information. If you want to get straight to playing with code, go through the steps in the -installation section and then visit the project `demo/01_sequence_analysis` -or the less documented `demo/02_flu`. +installation section and then visit the `morloc` examples repo +[here](https://github.com/morloc-project/examples). The `fasta` example is +well-annotated and is a nice place to start. ## Status @@ -21,11 +22,11 @@ welcome. ## Running morloc -`morloc` should run on Linux and macOS. For Windows, I suggest using [Windows Subsystem for +`morloc` should run on Linux and macOS. For Windows, I suggest using [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/install). -The easiest way to use `morloc` is through containers. Unless you love running -with daemons, I recommend using podman. +The easiest way to start using `morloc` is through containers. Unless you love +running with daemons, I recommend using podman. 
A container with the morloc executable and batteries included can be retrieved from the GitHub container registry as follows: diff --git a/demos/01_sequence_analysis/Makefile b/demos/01_sequence_analysis/Makefile deleted file mode 100644 index ea51e08f..00000000 --- a/demos/01_sequence_analysis/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -all: - morloc make main.loc - -clean: - rm -rf nexus.py pool.cpp *out */nexus.py */*out */pool.cpp revcom.fasta - -run: - ./nexus.py fastaRevcom '"revcom.fasta"' '"test.fasta"' && cat revcom.fasta diff --git a/demos/01_sequence_analysis/README.md b/demos/01_sequence_analysis/README.md deleted file mode 100644 index bef8696d..00000000 --- a/demos/01_sequence_analysis/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# `morloc` DNA sequence example - -Before running this example, be sure you have followed the installation -instructions in the top-level README. Then install the `cppbase` morloc module -with `morloc install cppbase`. - -First open the `main.loc` file to see the top-level `morloc` program this we -will build. Then build the project as follows: - -``` sh -morloc make main.loc -``` - -This should generate the following files: - * pool.cpp - the generate C++ source code for (de)serialization and function composition - * pool-cpp.out - the compiled code - * nexus.py - the user interface - -Feel free to skim the `pool.cpp` file to see what the morlocks are up to -underground. You can also read the `nexus.py` script (which is just a Python -script). Both of these files are generated based on the `morloc` template -`main.loc`. - -To access the usage statement, run `./nexus.py -h`. This will list all exported -commands and the types of their input and output. Currently the help statement -is pretty minimal, but I'll remedy this in the near future. 
- -Commands can be called as follows: - -``` sh -$ ./nexus.py fastaRevcom '"test.fasta"' -">Unicorn -TGTATCTGTATCTGTATCTGTATC ->Dragon -TGTATCTGTATCTGTATCTGTATCTGTATCTGTATCTGTATCTGTATC -" -``` - -Why the weird quoting? The inputs to a morloc program are raw JSON data or files -containing JSON data. Raw string inputs need two levels of quotation since one -level is removed by Bash (hence, '"test.fasta"'). The returned value is also a -JSON string, so it is quoted. The `write_fasta` function could alternatively be -written to print directly to STDOUT instead of returning a string. - -If an argument does not parse as JSON, the nexus will check to see if it is a -readable file, if so it will treat the contents of the file as the JSON -input. So if you write `./nexus.py fastaRevcom test.fasta`, then the nexus will -open the fasta file and then treat the contents of the file as the -filename. - -To learn more about module construction, visit the `bio` and `fasta` modules in -this folder. diff --git a/demos/01_sequence_analysis/bio/bio.hpp b/demos/01_sequence_analysis/bio/bio.hpp deleted file mode 100644 index 3ecfc635..00000000 --- a/demos/01_sequence_analysis/bio/bio.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __BIO_HPP__ -#define __BIO_HPP__ - -#include - -std::string revcom(std::string seq){ - size_t N = seq.size(); - std::string revSeq(N, '*'); - for (size_t i = 0; i < N; i++){ - switch(seq[i]){ - case 'A': - revSeq[N - i - 1] = 'T'; - break; - case 'T': - revSeq[N - i - 1] = 'A'; - break; - case 'G': - revSeq[N - i - 1] = 'C'; - break; - case 'C': - revSeq[N - i - 1] = 'G'; - break; - default: - revSeq[N - i - 1] = seq[i]; - break; - } - } - return revSeq; -} - -#endif diff --git a/demos/01_sequence_analysis/bio/main.loc b/demos/01_sequence_analysis/bio/main.loc deleted file mode 100644 index 3a810eb0..00000000 --- a/demos/01_sequence_analysis/bio/main.loc +++ /dev/null @@ -1,8 +0,0 @@ -module bio (revcom) - -source Cpp from "bio.hpp" ("revcom") - -type Cpp => Str = 
"std::string" - --- Take the reverse complement of a DNA sequence -revcom :: Str -> Str diff --git a/demos/01_sequence_analysis/bio/package.yaml b/demos/01_sequence_analysis/bio/package.yaml deleted file mode 100644 index bc5db05e..00000000 --- a/demos/01_sequence_analysis/bio/package.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: bio -version: 0.0.0 -homepage: null -synopsis: bioinformatics module -description: Basic bioinformatics functions for genome analysis and all that. -category: bioinformatics -license: MIT -author: "Zebulun Arendsee" -maintainer: "zbwrnz@gmail.com" -github: null -bug-reports: null diff --git a/demos/01_sequence_analysis/fasta/fastaIO.hpp b/demos/01_sequence_analysis/fasta/fastaIO.hpp deleted file mode 100644 index bea2f577..00000000 --- a/demos/01_sequence_analysis/fasta/fastaIO.hpp +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef __FASTAIO_HPP__ -#define __FASTAIO_HPP__ - -#include -#include -#include -#include -#include - -std::vector> readFasta(std::string filename){ - std::vector seq; - std::vector def; - std::string line; - std::ifstream fastaFile(filename); - bool begin = false; - if (fastaFile.is_open()) { - while (std::getline(fastaFile,line)) { - if (line.size() == 0) { - continue; - } - if (line[0] == '>') { - def.push_back(line.substr(1,line.size()-1)); - begin = true; - } else { - if (begin){ - seq.push_back(line.substr(0, line.size())); - } else { - seq[seq.size()-1].append(line); - } - begin = false; - } - } - fastaFile.close(); - } - std::vector> out; - for (size_t i = 0; i < seq.size(); i++){ - out.push_back(std::make_tuple(def[i], seq[i])); - } - return out; -} - -std::string writeFasta(std::string filename, std::vector> bioseq){ - std::ofstream fh; - fh.open(filename); - for(size_t i = 0; i < bioseq.size(); i++){ - fh << ">" << std::get<0>(bioseq[i]) << '\n'; - fh << std::get<1>(bioseq[i]) << '\n'; - } - fh.close(); - return filename; -} - -#endif diff --git a/demos/01_sequence_analysis/fasta/main.loc 
b/demos/01_sequence_analysis/fasta/main.loc deleted file mode 100644 index 91c177d1..00000000 --- a/demos/01_sequence_analysis/fasta/main.loc +++ /dev/null @@ -1,45 +0,0 @@ -module fasta - ( readFasta - , writeFasta - ) - -{- Define the Str type alias from cppbase, this allows 'Str' to be used in -place of the C++ value of 'std::string' -} -type Cpp => Str = "std::string" -type Cpp => List a = "std::vector<$1>" a -type Cpp => Tuple2 a b = "std::tuple<$1,$2>" a b - --- Take the reverse complement of a DNA sequence -revcom :: Str -> Str - -{- A FASTA file contains a header, which is a string describing a sequence, and -a biological sequence (DNA or protein). The type [(a, Str)] generalizes this by -allowing any type to describe the sequence. Many libraries create complex OOP -hierarchies to describe biological sequences. While `morloc` can work with the -OOP paradigm (see the `test-suite/golden-tests/object-1*` tests), I strongly -recommend this simpler more elegant approach. For one, all the rich functions -for mapping over and parallelizing list operations will be immediately usable. -Also the type is so universal that most languages will have no problem -supporting it. -} -type (Fasta a) = [(a, Str)] - -{- Currently, the `type` keyword introduces a simple type aliases. The term -'Filename' will disappear early in the compile process. So currently it is -useful only to increase readability. However, in the future I will add semantic -layer over the type system that will allow rich knowledge to be encoded about -the type terms. -} -type Filename = Str - -{- Source and export functions for reading and writing the universal -representations of sequence, [(a,Str)], to the FASTA format used commonly in -bioinformatics. -} -source Cpp from "fastaIO.hpp" ("readFasta", "writeFasta") - -{- `readFasta` and `writeFasta` are both IO operations. The `morloc` typesystem -currently has no mechanism to describe this (e.g., no IO monad). 
I haven't yet -settled on an effect handling system. -} -readFasta :: Filename -> Fasta Str - -{- While a more idiomatic C++ implementation would be to return `void`, -returning the filename makes it easier to continue the pipeline. -} -writeFasta :: Filename -> Fasta Str -> Filename diff --git a/demos/01_sequence_analysis/fasta/package.yaml b/demos/01_sequence_analysis/fasta/package.yaml deleted file mode 100644 index d2c31d59..00000000 --- a/demos/01_sequence_analysis/fasta/package.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: fasta -version: 0.0.0 -homepage: null -synopsis: FASTA sequence handling -description: Basic support for reading, writing, and linting FASTA files -category: bioinformatics, file handling -license: MIT -author: "Zebulun Arendsee" -maintainer: "zbwrnz@gmail.com" -github: null -bug-reports: null diff --git a/demos/01_sequence_analysis/main.loc b/demos/01_sequence_analysis/main.loc deleted file mode 100644 index de9c1da0..00000000 --- a/demos/01_sequence_analysis/main.loc +++ /dev/null @@ -1,23 +0,0 @@ --- Declare functions for export. These are the functions that are exported when --- this module is imported by another module AND the functions that become --- user-facing subcommands when this module is built as an executable. -module main - ( readFasta - , writeFasta - , revcom - , fastaRevcom - ) - - --- Import functions from the 'fasta' and 'bio' modules in the working directory -import fasta (readFasta, writeFasta) -import bio (revcom) - --- Import the local cppbase module. 
To install this module run: --- $ morloc install cppbase --- This command will install the module in the folder ~/.morloc/lib --- `map_val` has the type `(b -> c) -> [(a,b)] -> [(a,c)]` -import cppbase (map, fst, snd) - --- take the reverse complement of all entries in a fasta file -fastaRevcom outfile filename = writeFasta outfile (map (\x -> (fst x, revcom (snd x))) (readFasta filename)) diff --git a/demos/01_sequence_analysis/test.fasta b/demos/01_sequence_analysis/test.fasta deleted file mode 100644 index 8440e954..00000000 --- a/demos/01_sequence_analysis/test.fasta +++ /dev/null @@ -1,8 +0,0 @@ ->Unicorn -GATACAGATACA -GATACAGATACA ->Dragon -GATACAGATACA -GATACAGATACA -GATACAGATACA -GATACAGATACA diff --git a/demos/02_flu/.gitignore b/demos/02_flu/.gitignore deleted file mode 100644 index f45b7582..00000000 --- a/demos/02_flu/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -log -*pdf -*rda -tree.json -__pycache__/ -nexus.py -pool-cpp.out -pool.R -pool.cpp -pool.py diff --git a/demos/02_flu/Makefile b/demos/02_flu/Makefile deleted file mode 100644 index 03977672..00000000 --- a/demos/02_flu/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -all: nexus.py - ./nexus.py makeTree data/config.json > tree.json - ./nexus.py plot data/config.json tree.json - -nexus.py: - morloc make main.loc - -clean: - rm -rf pool* nexus* *pdf *rda __pycache__ tree.json diff --git a/demos/02_flu/data/config.json b/demos/02_flu/data/config.json deleted file mode 100644 index 5b3e6a67..00000000 --- a/demos/02_flu/data/config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ "mindate" : "2021/01/01" -, "maxdate" : "2021/01/14" -, "reffile" : "data/refs.txt" -, "treefile" : "tree.pdf" -, "email" : "wena@mailinator.com" -, "query" : "Influenza+A+Virus[Organism]+H3N2[ALL]+HA[ALL]" -} diff --git a/demos/02_flu/data/refs.txt b/demos/02_flu/data/refs.txt deleted file mode 100644 index e33bd3ad..00000000 --- a/demos/02_flu/data/refs.txt +++ /dev/null @@ -1,29 +0,0 @@ -KU976624 1A.3.1 -MG917052 2010-human_like -MG754502 
2016-human_like -AF251403 C_I -MG836803 C_IV -MG663067 C_IVA -KY465581 C_IVB -KX264751 C_IVC -KP186097 C_IVD -KX150756 C_IVE -KR859577 C_IVF -KU976737 Eurasian_avian-like -MH814933 LAIV_C-I -MH802648 LAIV_gamma2-beta-like -MG669449 alpha -MF092751 beta -KU976748 delta-like -MG720203 delta1 -KP461657 delta2 -MG870268 gamma -KY859939 gamma2 -AY661190 humanVaccine -KJ609206 humanVaccine -KJ942616 humanVaccine -KJ942680 humanVaccine -MG974450 humanVaccine -MG870266 pdm -CY121680 pdm-vaccine -KU509703 pdm-vaccine diff --git a/demos/02_flu/lib/classify/main.loc b/demos/02_flu/lib/classify/main.loc deleted file mode 100644 index b2185815..00000000 --- a/demos/02_flu/lib/classify/main.loc +++ /dev/null @@ -1,41 +0,0 @@ -module lib.classify (classify) - -import lib.flutypes (Clade) - --- Import generic functions -import base - ( id - , snd - , ifelse - , eq - , size - , fst - , branch - , head - , const - , unique - , ne - , filter - ) - --- Import the tree type and generic functions for operating on it -import bio.tree - ( RootedTree - , pullNode - , push - ) - --- Traverse the phylogenetic tree setting clade labels -classify n e a :: RootedTree n e (a, Clade) -> RootedTree Str e (a, Clade) -classify - = push id passClade setLeaf - . pullNode snd pullClade - where - passClade parent edge child = (edge, ifelse (eq 0 (size child)) parent child) - setLeaf parent edge leaf = (edge, (fst leaf, parent)) - pullClade xs - = branch (eq 1 . size) head (const "") seenClades - where - seenClades = ( unique - . filter (ne 0 . 
size) - ) xs diff --git a/demos/02_flu/lib/flutypes/main.loc b/demos/02_flu/lib/flutypes/main.loc deleted file mode 100644 index 52e134ba..00000000 --- a/demos/02_flu/lib/flutypes/main.loc +++ /dev/null @@ -1,9 +0,0 @@ -module lib.flutypes (*) - -import types(Str) - --- Type aliases -type Accession = Str -type Clade = Str -type Sequence = Str -type Date = Str diff --git a/demos/02_flu/lib/retrieve/entrez.hpp b/demos/02_flu/lib/retrieve/entrez.hpp deleted file mode 100644 index 7294324e..00000000 --- a/demos/02_flu/lib/retrieve/entrez.hpp +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef __morloc_flucase_entrez_helpers_hpp__ -#define __morloc_flucase_entrez_helpers_hpp__ - -#include -#include -#include - -using json = nlohmann::ordered_json; - -std::string setLeafName(std::tuple meta) { - json jsonObj; - std::string clade; - std::tie(jsonObj, clade) = meta; - - return clade + "|" + jsonObj["GBSeq_primary-accession"].get() + "|" + jsonObj["GBSeq_length"].get(); -} - -#endif diff --git a/demos/02_flu/lib/retrieve/entrez.py b/demos/02_flu/lib/retrieve/entrez.py deleted file mode 100644 index 9f831ab6..00000000 --- a/demos/02_flu/lib/retrieve/entrez.py +++ /dev/null @@ -1,21 +0,0 @@ -# parseRecord :: JsonObj -> (JsonObj, Sequence) -def parseRecord(jsonObj): - sequence = jsonObj["GBSeq_sequence"].upper() - del jsonObj["GBSeq_sequence"] - return (jsonObj, sequence) - -# labelRef :: Map Accession Clade -> JsonObj -> (JsonObj, Clade) -def labelRef(cladeMap, jsonObj): - accession = jsonObj["GBSeq_primary-accession"] - if accession in cladeMap: - return (jsonObj, cladeMap[accession]) - else: - return (jsonObj, "") - -# setLeafName :: (JsonObj, Clade) -> Str -def setLeafName(meta): - (jsonObj, clade) = meta - return ( clade + "|" + - jsonObj["GBSeq_primary-accession"] + "|" + - jsonObj["GBSeq_length"] - ) diff --git a/demos/02_flu/lib/retrieve/main.loc b/demos/02_flu/lib/retrieve/main.loc deleted file mode 100644 index 67b408f7..00000000 --- a/demos/02_flu/lib/retrieve/main.loc +++ 
/dev/null @@ -1,75 +0,0 @@ -module lib.retrieve (retrieve, setLeafName, FluConfig) - -import lib.flutypes - ( Accession - , Clade - , Date - , Sequence - ) - --- Import functions for accessing public sequence records -import bio.db - ( searchEntrez as fetchIds - , nucleotideAccessionToJson as fetchRecords - , EntrezSearchConfig - , EntrezFetchConfig - ) - --- Import generic functions -import base (map, concat, shard, join, keys, onFst, sleep, readMap) -import json (JsonObj) -import types (Filename) - - --- Define the configuration record -record FluConfig = FluConfig - { mindate :: Date - , maxdate :: Date - , reffile :: Filename - , treefile :: Filename - , query :: Str - , email :: Str - } - --- Specify the representation of this record in Python and R -record Py => FluConfig = "dict" -record R => FluConfig = "list" - --- Source python functions for dealing with Entrez records -source Py from "entrez.py" - ( "parseRecord" - , "labelRef" - , "setLeafName" - ) - --- Source C++ alternative for one of these functions -source Cpp from "entrez.hpp" ("setLeafName") - --- Define the general type of each function -parseRecord :: JsonObj -> (JsonObj, Sequence) -labelRef :: Map Accession Clade -> JsonObj -> (JsonObj, Clade) -setLeafName :: (JsonObj, Sequence) -> Str - --- Retrieve sequence data from Entrez and tag reference strains -retrieve :: FluConfig -> [((JsonObj, Clade), Sequence)] -retrieve config = - ( map (onFst (labelRef refmap)) - . concat - . map ( map parseRecord - . sleep 1.0 - . fetchRecords fetchConfig - ) - . shard 30 - . join (keys refmap) - . 
fetchIds searchConfig - ) config@query - where - searchConfig = - { email = config@email - , db = "nuccore" - , mindate = config@mindate - , maxdate = config@maxdate - , retmax = 1000 - } - fetchConfig = { email = config@email } - refmap = readMap config@reffile diff --git a/demos/02_flu/lib/treeplot/main.loc b/demos/02_flu/lib/treeplot/main.loc deleted file mode 100644 index 04a74580..00000000 --- a/demos/02_flu/lib/treeplot/main.loc +++ /dev/null @@ -1,7 +0,0 @@ -module lib.treeplot (plotTree) - -import bio.tree(RootedTree) -import types (Filename) - -source R from "plot-tree.R" ("plotTree") -plotTree :: Filename -> RootedTree Str Real Str -> () diff --git a/demos/02_flu/lib/treeplot/plot-tree.R b/demos/02_flu/lib/treeplot/plot-tree.R deleted file mode 100644 index 6f24b1b3..00000000 --- a/demos/02_flu/lib/treeplot/plot-tree.R +++ /dev/null @@ -1,9 +0,0 @@ -library(ape) - -plotTree <- function(outputpdffilename, tree) { - pdf(outputpdffilename, width = 8, height = length(tree$tip.label) * 0.1) - par(cex = 0.7) # Adjusts the label font size for readability - plot(tree, show.tip.label = TRUE, cex = 0.7) # Plots the tree with tip labels - dev.off() # Closes the PDF device - NULL -} diff --git a/demos/02_flu/main.loc b/demos/02_flu/main.loc deleted file mode 100644 index e1d6ef4b..00000000 --- a/demos/02_flu/main.loc +++ /dev/null @@ -1,17 +0,0 @@ -module flucase (plot, makeTree) - -import types (Filename) - -import lib.retrieve (retrieve, setLeafName, FluConfig) -import lib.classify (classify) -import lib.treeplot (plotTree) -import lib.flutypes (Date, Clade) - -import bio.algo (upgma) -import bio.tree (treeBy, mapLeaf, RootedTree) - -plot :: FluConfig -> RootedTree Clade Real (JsonObj, Clade) -> () -plot config = plotTree config@treefile . mapLeaf setLeafName - -makeTree :: FluConfig -> RootedTree Str Real (JsonObj, Clade) -makeTree = classify . treeBy upgma . retrieve