diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c826a4e..8377b3d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -35,6 +35,7 @@ jobs: - name: Render Quarto uses: quarto-dev/quarto-actions/render@v2 + - name: Publish to GitHub Pages if: github.event_name != 'pull_request' uses: quarto-dev/quarto-actions/publish@v2 @@ -42,4 +43,22 @@ jobs: render: false target: gh-pages env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Commit PDF + if: github.event_name != 'pull_request' + uses: EndBug/add-and-commit@v9 + with: + add: '_manuscript/index.pdf' + author_name: 'GitHub Actions' + message: 'Add index.pdf at ${{ github.sha }}' + + + - name: Commit DOCX + if: github.event_name != 'pull_request' + uses: EndBug/add-and-commit@v9 + with: + add: '_manuscript/index.docx' + author_name: 'GitHub Actions' + message: 'Add index.docx at ${{ github.sha }}' + diff --git a/_quarto.yml b/_quarto.yml index 3219da3..ae88703 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -10,8 +10,11 @@ format: hypothesis: true toc: true - # docx: default - # jats: default + docx: + toc: true + number-sections: true + highlight-style: github + pdf: default number-sections: true diff --git a/index.qmd b/index.qmd index 3caf5a0..3f1b395 100644 --- a/index.qmd +++ b/index.qmd @@ -7,16 +7,16 @@ heavy construction. Recent progress in machine learning and artificial intelligence promises to advance research and understanding across a wide range of fields and -activities. In tandem, an increased awareness of the importance of open data -for reproducibility and scientific transparency is making inroads in fields -that have not traditionally produced large publicly available datasets. 
Data -sharing requirements from publishers and funders, as well as from other -stakeholders, have also created pressure to make datasets with research and/or -public interest value available through digital repositories. However, to make -the best use of existing data, and facilitate the creation of useful future +activities. In tandem, increased awareness of the importance of open data for +reproducibility and scientific transparency is making inroads in fields that +have not traditionally produced large publicly available datasets. Data sharing +requirements from publishers and funders, as well as from other stakeholders, +have also created pressure to make datasets with research and/or public +interest value available through digital repositories. However, to make the +best use of existing data, and facilitate the creation of useful future datasets, robust, interoperable and usable standards need to evolve and adapt over time. The open-source development model provides significant potential -benefits to the process of standard creation and adaptation. In particular, +benefits to the process of standard creation and adaptation. In particular, the development and adaptation of standards can use long-standing socio-technical processes that have been key to managing the development of software, and allow incorporating broad community input into the formulation of these standards. By @@ -26,7 +26,7 @@ validation), processes such as automated testing and continuous integration, which have been important in the development of open-source software, can be adopted in defining data and metadata standards as well. Similarly, open-source governance provides a range of stakeholders a voice in the development of -standards, potentially enabling use-cases and concerns that would not be taken +standards, potentially enabling use cases and concerns that would not be taken into account in a top-down model of standards development. 
On the other hand, open-source models carry unique risks that need to be incorporated into the process. @@ -34,6 +34,9 @@ process. {{< include sections/01-introduction.qmd >}} {{< include sections/02-challenges.qmd >}} +{{< include sections/xx-use-cases.qmd >}} +{{< include sections/xx-cross-sector.qmd >}} {{< include sections/03-recommendations.qmd >}} +{{< include sections/04-acknowledgments.qmd >}} diff --git a/references.bib b/references.bib index 51e5bb2..b68661c 100644 --- a/references.bib +++ b/references.bib @@ -1,3 +1,125 @@ + + +@book{Mons2018DataStewardshipBook, + address = {Milton}, + author = {Mons, Barend}, + date-added = {2024-06-17 11:30:13 -0700}, + date-modified = {2024-06-17 11:30:13 -0700}, + doi = {10.1201/9781315380711}, + edition = {1}, + id = {cdi{\_}askewsholts{\_}vlebooks{\_}9781315351148}, + isbn = {9780815348184}, + keywords = {big data ; Bioinformatics ; Business enterprises ; COMPUTERSCIENCEnetBASE ; data curation ; data formatting ; data integration ; Data Preparation \& Mining ; Data protection ; data publishing ; Database management ; FAIR data ; Information resources management ; Information technology ; INFORMATIONSCIENCEnetBASE ; SCI-TECHnetBASE ; Statistical Computing ; STATSnetBASE ; STMnetBASE}, + n2 = {Data Stewardship for Open Science: Implementing FAIR Principles has been written with the intention of making scientists, funders, and innovators in all disciplines and stages of their professional activities broadly aware of the need, complexity, and challenges associated with open science, modern science communication, and data stewardship. The FAIR principles are used as a guide throughout the text, and this book should leave experimentalists consciously incompetent about data stewardship and motivated to respect data stewards as representatives of a new profession, while possibly motivating others to consider a career in the field. 
The ebook, avalable for no additional cost when you buy the paperback, will be updated every 6 months on average (providing that significant updates are needed or avaialble). Readers will have the opportunity to contribute material towards these updates, and to develop their own data management plans, via the free Data Stewardship Wizard .}, + publisher = {CRC Press}, + title = {Data Stewardship for Open Science: Implementing FAIR Principles}, + volume = {1}, + year = {2018}, + bdsk-url-1 = {https://doi.org/10.1201/9781315380711}} + + +@MISC{Koch2012-ve, + title = "Observatories of the mind", + booktitle = "Nature Publishing Group {UK}", + author = "Koch, Christof and Clay Reid, R", + abstract = "An ambitious project to map the mouse brain at the Allen + Institute for Brain Science is a huge undertaking that may + unify neuroscience, argue Christof Koch and R. Clay Reid.", + month = mar, + year = 2012, + howpublished = "\url{http://dx.doi.org/10.1038/483397a}", + note = "Accessed: 2024-6-17", + language = "en" +} + + + +@ARTICLE{Basaglia2023-dq, + title = "Data preservation in high energy physics", + author = "Basaglia, T and Bellis, M and Blomer, J and Boyd, J and Bozzi, C + and Britzger, D and Campana, S and Cartaro, C and Chen, G and + Couturier, B and David, G and Diaconu, C and Dobrin, A and + Duellmann, D and Ebert, M and Elmer, P and Fernandes, J and + Fields, L and Fokianos, P and Ganis, G and Geiser, A and Gheata, + M and Lopez, J B Gonzalez and Hara, T and Heinrich, L and + Hildreth, M and Herner, K and Jayatilaka, B and Kado, M and + Keeble, O and Kohls, A and Naim, K and Lange, C and + Lassila-Perini, K and Levonian, S and Maggi, M and Marshall, Z + and Vila, P Mato and Me{\v c}ionis, A and Morris, A and Piano, S + and Potekhin, M and Schr{\"o}der, M and Schwickerath, U and + Sexton-Kennedy, E and {\v S}imko, T and Smith, T and South, D and + Verbytskyi, A and Vidal, M and Vivace, A and Wang, L and Watt, G + and Wenaus, T and {DPHEP Collaboration}", + 
abstract = "Data preservation is a mandatory specification for any present + and future experimental facility and it is a cost-effective way + of doing fundamental research by exploiting unique data sets in + the light of the continuously increasing theoretical + understanding. This document summarizes the status of data + preservation in high energy physics. The paradigms and the + methodological advances are discussed from a perspective of more + than ten years of experience with a structured effort at + international level. The status and the scientific return related + to the preservation of data accumulated at large collider + experiments are presented, together with an account of ongoing + efforts to ensure long-term analysis capabilities for ongoing and + future experiments. Transverse projects aimed at generic + solutions, most of which are specifically inspired by open + science and FAIR principles, are presented as well. A prospective + and an action plan are also indicated.", + journal = "The European Physical Journal C", + volume = 83, + number = 9, + pages = "795", + month = sep, + year = 2023 +} + + + + +@inproceedings{wells1979fits, + title={FITS-a flexible image transport system}, + author={Wells, Donald Carson and Greisen, Eric W}, + booktitle={Image processing in astronomy}, + pages={445}, + year={1979} +} + +@ARTICLE{Rubel2022NWB, + title = "The Neurodata Without Borders ecosystem for neurophysiological + data science", + author = "R{\"u}bel, Oliver and Tritt, Andrew and Ly, Ryan and Dichter, + Benjamin K and Ghosh, Satrajit and Niu, Lawrence and Baker, + Pamela and Soltesz, Ivan and Ng, Lydia and Svoboda, Karel and + Frank, Loren and Bouchard, Kristofer E", + abstract = "The neurophysiology of cells and tissues are monitored + electrophysiologically and optically in diverse experiments and + species, ranging from flies to humans. 
Understanding the brain + requires integration of data across this diversity, and thus + these data must be findable, accessible, interoperable, and + reusable (FAIR). This requires a standard language for data and + metadata that can coevolve with neuroscience. We describe design + and implementation principles for a language for neurophysiology + data. Our open-source software (Neurodata Without Borders, NWB) + defines and modularizes the interdependent, yet separable, + components of a data language. We demonstrate NWB's impact + through unified description of neurophysiology data across + diverse modalities and species. NWB exists in an ecosystem, which + includes data management, analysis, visualization, and archive + tools. Thus, the NWB data language enables reproduction, + interchange, and reuse of diverse neurophysiology data. More + broadly, the design principles of NWB are generally applicable to + enhance discovery across biology through data FAIRness.", + journal = "Elife", + volume = 11, + month = oct, + year = 2022, + keywords = "FAIR data; Neurophysiology; archive; data ecosystem; data + language; data standard; human; mouse; neuroscience; rat", + language = "en" +} + + @ARTICLE{Gorgolewski2016BIDS, title = "The {Brain} {Imaging} {Data} {Structure}, a format for organizing and describing outputs of neuroimaging experiments", diff --git a/sections/01-introduction.qmd b/sections/01-introduction.qmd index da89585..de2fd6e 100644 --- a/sections/01-introduction.qmd +++ b/sections/01-introduction.qmd @@ -1,104 +1,45 @@ # Introduction Data-intensive discovery has become an important mode of knowledge production -across many research fields and has had a significant and broad impact across -all of society. This is becoming increasingly salient as recent developments in -machine learning and artificial intelligence (AI) promise to increase the value -of large, multi-dimensional, heterogeneous data sources. 
Coupled with these new -machine learning techniques, these datasets can help us understand everything -from the cellular operations of the human body, through business transactions -on the internet, to the structure and history of the universe. However, the -development of new machine learning methods, and data-intensive discovery more -generally, rely heavily on the availability and usability of these large -datasets. Data can be openly available but still not useful if it cannot be -properly understood. In current conditions in which almost all of the relevant -data is stored in digital formats, and many relevant datasets can be found -through the communication networks of the world wide web, Findability, -Accessibility, Interoperability and Reusability (FAIR) principles for data -management and stewardship become critically important -\cite{Wilkinson2016FAIR}. - -One of the main mechanisms through which these principles are promoted is the -development of \emph{standards} for data and metadata. Standards can vary in -the level of detail and scope, and encompass such things as \emph{file formats} -for the storing of certain data types, \emph{schemas} for databases that store -a range of data types, \emph{ontologies} to describe and organize metadata in a -manner that connects it to field-specific meaning, as well as mechanisms to -describe \emph{provenance} of different data derivatives. The importance of -standards was underscored in a recent report report by the Subcommittee on Open -Science of the National Science and Technology Council on "Desirable -characteristics of data repositories for federally funded research" -\cite{nstc2022desirable}. The report explicitly called out the importance of -"allow[ing] datasets and metadata to be accessed, downloaded, or exported from -the repository in widely used, preferably non-proprietary, formats consistent -with standards used in the disciplines the repository serves." 
This highlights -the need for data and metadata standards across a variety of different kinds of -data. In addition, a report from the National Institute of Standards and -Technology on "U.S. Leadership in AI: A Plan for Federal Engagement in -Developing Technical Standards and Related Tools" emphasized that -- -specifically for the case of AI -- "U.S. government agencies should prioritize -AI standards efforts that are [...] Consensus-based, [...] Inclusive and -accessible, [...] Multi-path, [...] Open and transparent, [...] and [that] -Result in globally relevant and non-discriminatory standards..." -\cite{NIST2019}. The converging characteristics of standards that arise from -these reports suggest that considerable thought needs to be given to the manner -in which standards arise, so that these goals are achieved. - -Standards for a specific domain can come about in various ways, but very -broadly speaking two kinds of mechanisms can generate a standard for a specific -type of data: (i) top-down: in this case a (usually) small group of people -develop the standard and disseminate it to the communities of interest with -very little input from these communities. An example of this mode of standards -development can occur when an instrument is developed by a manufacturer and -users of this instrument receive the data in a particular format that was -developed in tandem with the instrument; and (ii) bottom-up: in this case, -standards are developed by a larger group of people that convene and reach -consensus about the details of the standard in an attempt to cover a large -range of use-cases. Most standards are developed through an interplay between -these two modes, and understanding how to make the best of these modes is -critical in advancing the development of data and metadata standards. - -One source of inspiration for bottom-up development of robust, adaptable and -useful standards comes from open-source software (OSS). 
OSS has a long history +across many research fields and it is having a significant and broad impact +across all of society. This is becoming increasingly salient as recent +developments in machine learning and artificial intelligence (AI) promise to +increase the value of large, multi-dimensional, heterogeneous data sources. +Coupled with these new machine learning techniques, these datasets can help us +understand everything from the cellular operations of the human body, through +business transactions on the internet, to the structure and history of the +universe. However, the development of new machine learning methods and +data-intensive discovery more generally depend on Findability, Accessibility, +Interoperability and Reusability (FAIR) of data [@Wilkinson2016FAIR]. One of +the main mechanisms through which the FAIR principles are promoted is the +development of *standards* for data and metadata. Standards can vary in the +level of detail and scope, and encompass such things as *file formats* for the +storage of certain data types, *schemas* for databases that organize data, +*ontologies* to describe and organize metadata in a manner that connects it to +field-specific meaning, as well as mechanisms to describe *provenance* of +analysis products. + +Community-driven development of robust, adaptable and useful standards draws +significant inspiration from the development of open-source software (OSS) and +has many parallels and overlaps with OSS development. OSS has a long history going back to the development of the Unix operating system in the late 1960s. Over the time since its inception, the large community of developers and users -of OSS have have developed a host of socio-technical mechanisms that support -the development and use of OSS. 
For example, the Open Source Initiative (OSI), -a non-profit organization that was founded in 1990s has evolved a set of +of OSS have developed a host of socio-technical mechanisms that support the +development and use of OSS. For example, the Open Source Initiative (OSI), a +non-profit organization that was founded in the 1990s developed a set of guidelines for licensing of OSS that is designed to protect the rights of -developers and users. Technical tools to support the evolution of open-source -software include software for distributed version control, such as the Git -Source-code management system. When these social and technical innovations are -put together they enable a host of positive defining features of OSS, such as -transparency, collaboration, and decentralization. These features allow OSS to -have a remarkable level of dynamism and productivity, while also retaining the -ability of a variety of stakeholders to guide the evolution of the software to -take their needs and interests into account. - -A necessary complement to these technical tools and legal instruments have been -a host of practices that define the social interactions \emph{within} -communities of OSS developers and users, and structures for governing these -communities. While many OSS communities started as projects led by individual -founders (so-called benevolent dictators for life, or BDFL; a title first -bestowed on the originator of the Python programming language, Guido Van Rossum -\cite{Van_Rossum2008BDFL}), recent years have led to an increased understanding -that minimal standards of democratic governance are required in order for OSS -communities to develop and flourish. This has led to the adoption of codes of -conduct that govern the standards of behavior and communication among project -stakeholders. It has also led to the establishment of democratically elected -steering councils/committees from among the members and stakeholders of an OSS -project's community. 
- -It was also within the Python community that an orderly process for -community-guided evolution of an open-source software project emerged, through -the Python Enhancement Proposal (PEP) mechanism \cite{Warsaw2000PEP1}, which -lays out how major changes to the software should be proposed, advocated for, -and eventually decided on. While these tools, ideas, and practices evolved in -developing software, they are readily translated to other domains. For example, -OSS notions surrounding IP have given rise to the Creative Commons movement -that has expanded these notions to apply to a much wider range of human -creative endeavours. Similarly OSS notions regarding collaborative structures -have pervaded the current era of open science and team science -\cite{Baumgartner2023TeamScience, Koch2016TeamScience}. - +developers and users. On the more technical side, tools such as the Git +Source-code management system support open-source development workflows that +can be adopted in the development of standards. When these social and technical +innovations are put together they enable a host of positive defining features +of OSS, such as transparency, collaboration, and decentralization. These +features allow OSS to have a remarkable level of dynamism and productivity, +while also retaining the ability of a variety of stakeholders to guide the +evolution of the software to take their needs and interests into account. The +present report seeks to explore how OSS processes and tools have affected the +development of data and metadata standards. The report will triangulate common +features of a variety of use-cases, will identify some of the challenges and +pitfalls of this mode of standards development, and will make recommendations +for future developments and policies that can help this mode of standards +development thrive and reach its full potential. 
diff --git a/sections/02-challenges.qmd b/sections/02-challenges.qmd index c47d619..c04c7b8 100644 --- a/sections/02-challenges.qmd +++ b/sections/02-challenges.qmd @@ -22,17 +22,17 @@ about the practical implications of changes to the standards. ## Unclear pathways for standards success -Standards typically develop organically through sustained and persistent efforts from dedicated -groups of data practitioneers. These include scientists and the broader ecosystem of data curators and users. However there is no playbook on the structure and components of a data standard, or the pathway that moves a data implementation to a data standard. -As a result, data standardization lacks formal avenues for research grants. +Standards typically develop organically through sustained and persistent efforts from dedicated +groups of data practitioners. These include scientists and the broader ecosystem of data curators and users. However there is no playbook on the structure and components of a data standard, or the pathway that moves a data implementation to a data standard. +As a result, data standardization lacks formal avenues for research grants. ## Cross domain funding gaps -Data standardization investment is justified if the standard is generalizable beyond any specific science domain. However while the use cases are domain sciences based, data standardization is seen as a data infrastrucutre and not a science investment. Moreover due to how science research funding works, scientists lack incentives to work across domains, or work on infrastructure problems. +Data standardization investment is justified if the standard is generalizable beyond any specific science domain. However while the use cases are domain sciences based, data standardization is seen as a data infrastructure and not a science investment. Moreover due to how science research funding works, scientists lack incentives to work across domains, or work on infrastructure problems. 
-## Data instrumentation issues +## Data instrumentation issues -Data for scientific observations are often generated by proprietary instrumentation due to commercialization or other profit driven incentives. There islack of regulatory oversight to adhere to available standards or evolve Significant data transformation is required to get data to a state that is amenable to standards, if available. If not available, there is lack of incentive to set aside investment or resources to invest in establishing data standards. +Data for scientific observations are often generated by proprietary instrumentation due to commercialization or other profit driven incentives. There is lack of regulatory oversight to adhere to available standards or evolve. Significant data transformation is required to get data to a state that is amenable to standards, if available. If not available, there is lack of incentive to set aside investment or resources to invest in establishing data standards. ## Sustainability diff --git a/sections/03-recommendations.qmd b/sections/03-recommendations.qmd index c3acb32..a3441c2 100644 --- a/sections/03-recommendations.qmd +++ b/sections/03-recommendations.qmd @@ -1,28 +1,56 @@ -# Recommendations for open source data and metadata standards +# Recommendations for open-source data and metadata standards -We propose the following recommendations: +In conclusion of this report, we propose the following recommendations: -## Funding or Grantmaking entities: +## Funding or Grantmaking entities: ### Fund Data Standards Development -Data standards development should be seen integral to science innovation and earmarked for funding in research grants. Funding models should encourage the development and adoption of standards, and fund associated community efforts and tools for this. -### Invest in Data Stewards -Recognize data stewards as a distinct role in research and science investment. 
Set up programs for training for data stewards and invest in career paths that encourage this role. +While some funding agencies already support standards development as part of +the development of informatics infrastructures, data standards development +should be seen as integral to science innovation and earmarked for funding in +research grants, not only in specialized contexts. Funding models should +encourage the development and adoption of standards, and fund associated +community efforts and tools for this. The OSS model is seen as a particularly +promising avenue for an investment of resources, because it builds on +previously-developed procedures and technical infrastructure and because it +provides avenues for community input along the way. The clarity offered by +procedures for enhancement proposals and semantic versioning schemes adopted in +standards development offer avenues for a range of stakeholders to propose to +funding bodies well-defined contributions to large and field-wide standards +efforts. + +### Invest in Data Stewards Recognize data stewards as a distinct role in +research and science investment. Set up programs for training for data stewards +and invest in career paths that encourage this role. Initial proposals for the +curriculum and scope of the role have already been proposed (e.g., in +[@Mons2018DataStewardshipBook]) ### Review Data Standards Pathways -Invest in programs that examine retrospective pathways for establishing data standards. Encourage publication of lifecycles for successful data standards. Lifecycle should include process, creators, affiliations, grants, and adoption journeys. Make this documentation step integral to the work of standards creators and granting agencies. Retrocactively document #3 for standards such as CF(climate science), NASA genelab (space omics), OpenGIS (geospatial), DICOM (medical imaging), GA4GH (genomics), FITS (astronomy), Zarr (domain agnostic n-dimensional arrays)... ? 
+Invest in programs that examine retrospective pathways for establishing data +standards. Encourage publication of lifecycles for successful data standards. +Lifecycle should include process, creators, affiliations, grants, and adoption +journeys. Make this documentation step integral to the work of standards +creators and granting agencies. Retroactively document #3 for standards such +as CF (climate science), NASA genelab (space omics), OpenGIS (geospatial), DICOM +(medical imaging), GA4GH (genomics), FITS (astronomy), Zarr (domain agnostic +n-dimensional arrays)... ? -### Establish Governance +### Establish Governance -Establish governance for standards creation and adoption, especially for communities beyond a certain size that need to converge toward a new standard or rely on an existing standard. Review existing governance practices such as [TheOpenSourceWay](https://www.theopensourceway.org/the_open_source_way-guidebook-2.0.html#_project_and_community_governance). Data management plans should promote the sharing of not only data, but also metadata and descriptions of how to use it. +Establish governance for standards creation and adoption, especially for +communities beyond a certain size that need to converge toward a new standard +or rely on an existing standard. Review existing governance practices such as +[TheOpenSourceWay](https://www.theopensourceway.org/the_open_source_way-guidebook-2.0.html#_project_and_community_governance). +Data management plans should promote the sharing of not only data, but also +metadata and descriptions of how to use it. ### Program Manage Cross Sector alliances -Encourage cross sector and cross domain alliances that can impact successful standards creation. Invest in robust program management of these alliances to align pace and create incentives (for instance via Open Source Program Office / OSPO efforts). Similar to program officers at funding agencies, standards evolution need sustained PM efforts. 
Multi company partnerships should include strategic initiatives for standard establishment e.g. [Pistoiaalliance](https://www.pistoiaalliance.org/news/press-release-pistoia-alliance-launches-idmp-1-0/). +Encourage cross sector and cross domain alliances that can impact successful standards creation. Invest in robust program management of these alliances to align pace and create incentives (for instance via Open Source Program Office / OSPO efforts). Similar to program officers at funding agencies, standards evolution need sustained PM efforts. Multi company partnerships should include strategic initiatives for standard establishment e.g. [Pistoiaalliance](https://www.pistoiaalliance.org/news/press-release-pistoia-alliance-launches-idmp-1-0/). @@ -33,22 +61,22 @@ Stakeholder organizations should invest in training grants to establish curricul ## Science and Technology Communities: -### User Driven Development +### User Driven Development -Standards should be needs-driven and developed in close collaboration with users. Changes and enhancements should be in response to community feedback. +Standards should be needs-driven and developed in close collaboration with users. Changes and enhancements should be in response to community feedback. ### Meta-Standards development -Develop meta-standards or standards-of-standards. These are descriptions of cross-cutting best-practices and can be used as a basis of the analysis or assessment of an existing standard, or as guidelines to develop new standards. For instance, barriers to adopting a data standard irrespective of team size and technological capabilities should be considered. Meta standards should include formalization for versioning of standards & interaction with related software. Naming of standards should aid marketing and adoption. +Develop meta-standards or standards-of-standards. 
These are descriptions of cross-cutting best-practices and can be used as a basis of the analysis or assessment of an existing standard, or as guidelines to develop new standards. For instance, barriers to adopting a data standard irrespective of team size and technological capabilities should be considered. Meta standards should include formalization for versioning of standards & interaction with related software. Naming of standards should aid marketing and adoption. ### Ontology Development -Create ontology for standards process such as top down vs bottom up, minimum number of datasets, community size. Examine schema.org (w3c), PEP (Python), CDISC (FDA). +Create ontology for standards process such as top down vs bottom up, minimum number of datasets, community size. Examine schema.org (w3c), PEP (Python), CDISC (FDA). ### Formalization Guidelines -Amplify formalization/guidelines on how to create standards (example metadata schema specifications using [LinkML](https://linkml.io). +Amplify formalization/guidelines on how to create standards (example metadata schema specifications using [LinkML](https://linkml.io)). ### Landscape and Failure Analysis @@ -56,8 +84,8 @@ Before establishing a new standard, survey and document failure of current stand ### Machine Readability -Development of standards should be coupled with development of associated software. Make data standards machine readable, and software creation an integral part of establishing a standard's schema e.g. 
For identifiers for a person using CFF in citations, cffconvert software makes the CFF standard usable and useful. +Additionally, standards evolution should maintain software compatibility, and ability to translate and migrate between standards. diff --git a/sections/04-acknowledgments.qmd b/sections/04-acknowledgments.qmd new file mode 100644 index 0000000..acb4fc3 --- /dev/null +++ b/sections/04-acknowledgments.qmd @@ -0,0 +1,12 @@ +# Acknowledgements + +This report was produced following a +[workshop held at NSF headquarters in Alexandria, VA on April 8th-9th, 2024](https://uwescience.github.io/2024-open-source-standards-workshop/). +We would like to thank the speakers and participants in this workshop for the +time and thought that they put into the workshop. + +The workshop and this report were funded through [NSF grant +#2334483](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2334483&HistoricalAwards=false) +from the NSF [Pathways to Enable Open-Source Ecosystems +(POSE)](https://new.nsf.gov/funding/opportunities/pathways-enable-open-source-ecosystems-pose) +program. \ No newline at end of file diff --git a/sections/xx-cross-sector.qmd b/sections/xx-cross-sector.qmd new file mode 100644 index 0000000..4978d0b --- /dev/null +++ b/sections/xx-cross-sector.qmd @@ -0,0 +1,35 @@ +# Cross-sector interactions + +The importance of standards stems not only from discussions within research +fields about how research can best be conducted to take advantage of existing +and growing datasets, but also arises from interactions with other sectors. + +For example, an ongoing series of policy discussions that address the +interactions between research communities and the general public. In the United +States, these policies are expressed, for example, in memos issued by the +directors of the White House Office of Science and Technology Policy (OSTP), +James Holdren (in 2013) and Alondra Nelson (in 2022). 
While these memos focused +primarily on making peer-reviewed publications funded by the US Federal +government available to the general public, they also lay an increasingly +detailed path toward the publication and general availability of the data that +is collected in research that is funded by the US government. The general +guidance and overall spirit of these memos dovetail with more specific policy +guidance related to data and metadata standards. For example, the importance of +standards was underscored in a recent report by the Subcommittee on Open +Science of the National Science and Technology Council on the "Desirable +characteristics of data repositories for federally funded research" +[@nstc2022desirable]. The report explicitly called out the importance of +"allow[ing] datasets and metadata to be accessed, downloaded, or exported from +the repository in widely used, preferably non-proprietary, formats consistent +with standards used in the disciplines the repository serves." This highlights +the need for data and metadata standards across a variety of different kinds of +data. In addition, a report from the National Institute of Standards and +Technology on "U.S. Leadership in AI: A Plan for Federal Engagement in +Developing Technical Standards and Related Tools" emphasized that -- +specifically for the case of AI -- "U.S. government agencies should prioritize +AI standards efforts that are [...] Consensus-based, [...] Inclusive and +accessible, [...] Multi-path, [...] Open and transparent, [...] and [that] +result in globally relevant and non-discriminatory standards..." [@NIST2019]. +The converging characteristics of standards that arise from these reports +suggest that considerable thought needs to be given to how standards arise so +that these goals are achieved. 
diff --git a/sections/xx-use-cases.qmd b/sections/xx-use-cases.qmd new file mode 100644 index 0000000..5ae9d5d --- /dev/null +++ b/sections/xx-use-cases.qmd @@ -0,0 +1,72 @@ +# Use cases + +To understand how OSS development practices affect the development of data and +metadata standards, it is informative to demonstrate this cross-fertilization +through a few use cases. As we will see in these examples some fields, such as +astronomy, high-energy physics and earth sciences have a relatively long +history of shared data resources from organizations such as LSST and CERN, +while other fields have only relatively recently become aware of the value of +data sharing and its impact. These disparate histories inform how standards +have evolved and how OSS practices have pervaded their +development. + +## Astronomy + +One prominent example of a community-driven standard is the FITS (Flexible +Image Transport System) file format standard, which was developed in the late +1970s and early 1980s [@wells1979fits], and has been adopted worldwide for +astronomy data preservation and exchange. Essentially every software platform +used in astronomy reads and writes the FITS format. It was developed by +observatories in the 1980s to store image data in the visible and x-ray +spectrum. It has been endorsed by IAU, as well as funding agencies. Though the +format has evolved over time, “once FITS, always FITS”. That is, the format +cannot be evolved to introduce changes that break backwards-compatibility. +Among the features that make FITS so durable is that it was designed originally +to have a very restricted metadata schema. That is, FITS records were designed +to be the lowest common denominator of word lengths in computer systems at the +time. 
However, while FITS is compact, its ability to encode the coordinate +frame and pixels, means that data from different observational instruments can +be stored in this format and relationships between data from different +instruments can be related, rendering manual and error-prone procedures for +conforming images obsolete. + +## High-energy physics + +In HEP standards to collect the data have been established and the community is +fairly homogeneous, so standards have very high penetration [@Basaglia2023-dq]. +A top-down approach is taken so that within every large collaboration standards +are enforced, and this adoption is centrally managed. Access to raw data is +essentially impossible, and making it publicly available is both technically +very hard and potentially ill-advised. Analysis tools are tuned specifically to +the standards. Incentives to use the standards are provided by funders that +require the data management plan that specifies how the data is shared. + + +## Neuroscience + +In contrast to astronomy and HEP, Neuroscience has traditionally been a +"cottage industry", where individual labs have generated experimental data +designed to answer specific experimental questions. While this model still +exists, the field has also seen the emergence of new modes of data production +that focus on generating large shared datasets designed to answer many +different questions, more akin to the data generated in large astronomy data +collection efforts [@Koch2012-ve]. This change has been brought on through a +combination of technical advances in data acquisition techniques, which now +generate large and very high-dimensional/information-rich datasets, cultural +changes, which have ushered in new norms of transparency and reproducibility, +and funding initiatives that have encouraged this kind of data collection +(including the US BRAIN Initiative and the Allen Institute for Brain Science). 
+Neuroscience presents an interesting example because these changes are +relatively recent. This means that standards for data and metadata in +neuroscience have been prone to adopt many of the elements of OSS development. +Two salient examples in neuroscience are the Neurodata Without Borders file +format for neurophysiology data [@Rubel2022NWB] and the Brain Imaging Data +Structure standard for neuroimaging data [@Gorgolewski2016BIDS]. The latter in +particular has adopted a + + + +## Automated discovery + +## Citizen science +