diff --git a/references.bib b/references.bib index 6f3a3ac..6a222a3 100644 --- a/references.bib +++ b/references.bib @@ -1,3 +1,126 @@ +@ARTICLE{Hanisch2015-cu, + title = "The Virtual Astronomical Observatory: Re-engineering access to + astronomical data", + author = "Hanisch, R J and Berriman, G B and Lazio, T J W and Emery Bunn, S + and Evans, J and McGlynn, T A and Plante, R", + journal = "Astron. Comput.", + publisher = "Elsevier BV", + volume = 11, + pages = "190--209", + abstract = "The US Virtual Astronomical Observatory was a software + infrastructure and development project designed both to begin the + establishment of an operational Virtual Observatory (VO) and to + provide the US coordination with the international VO effort. The + concept of the VO is to provide the means by which an astronomer + is able to discover, access, and process data seamlessly, + regardless of its physical location. This paper describes the + origins of the VAO, including the predecessor efforts within the + US National Virtual Observatory, and summarizes its main + accomplishments. These accomplishments include the development of + both scripting toolkits that allow scientists to incorporate VO + data directly into their reduction and analysis environments and + high-level science applications for data discovery, integration, + analysis, and catalog cross-comparison. Working with the + international community, and based on the experience from the + software development, the VAO was a major contributor to + international standards within the International Virtual + Observatory Alliance. The VAO also demonstrated how an + operational virtual observatory could be deployed, providing a + robust operational environment in which VO services worldwide + were routinely checked for aliveness and compliance with + international standards. Finally, the VAO engaged in community + outreach, developing a comprehensive web site with on-line + tutorials, announcements, links to both US and internationally + developed tools and services, and exhibits and hands-on training + at annual meetings of the American Astronomical Society and + through summer schools and community days. All digital products + of the VAO Project, including software, documentation, and + tutorials, are stored in a repository for community access. The + enduring legacy of the VAO is an increasing expectation that new + telescopes and facilities incorporate VO capabilities during the + design of their data management systems.", + month = jun, + year = 2015, + language = "en" +} + +@ARTICLE{Larobina2023-vq, + title = "Thirty years of the {DICOM} standard", + author = "Larobina, Michele", + journal = "Tomography", + publisher = "mdpi.com", + volume = 9, + number = 5, + pages = "1829--1838", + abstract = "Digital Imaging and Communications in Medicine (DICOM) is an + international standard that defines a format for storing medical + images and a protocol to enable and facilitate data communication + among medical imaging systems. The DICOM standard has been + instrumental in transforming the medical imaging world over the + last three decades. Its adoption has been a significant + experience for manufacturers, healthcare users, and research + scientists. 
In this review, thirty years after introducing the
+                standard, we discuss the innovation, advantages, and
+                limitations of adopting the DICOM and its possible future
+                directions.",
+  month     = oct,
+  year      = 2023,
+  keywords  = "DICOM; communication protocols; file formats; metadata;
+               quantitative imaging",
+  language  = "en"
+}
+
+@INPROCEEDINGS{Mustra2008-xk,
+  title     = "Overview of the {DICOM} standard",
+  author    = "Mustra, Mario and Delac, Kresimir and Grgic, Mislav",
+  booktitle = "2008 50th International Symposium ELMAR",
+  publisher = "IEEE",
+  volume    = 1,
+  pages     = "39--44",
+  abstract  = "Digital technology has in the last few decades entered almost
+               every aspect of medicine. There has been a huge development in
+               noninvasive medical imaging equipment. Because there are many
+               medical equipment manufacturers, a standard for storage and
+               exchange of medical images needed to be developed. DICOM
+               (Digital Imaging and Communication in Medicine) makes medical
+               image exchange more easy and independent of the imaging
+               equipment manufacturer. Besides the image data, DICOM file
+               format supports other information useful to describe the image.
+               This makes DICOM easy to use and the data exchange fast and safe
+               while avoiding possible confusion caused by multiple files for
+               the same study.",
+  month     = sep,
+  year      = 2008
+}
+
+
+@ARTICLE{Scroggins2020-ut,
+  title     = "Once {FITS}, Always {FITS}? Astronomical Infrastructure in
+               Transition",
+  author    = "Scroggins, Michael and Boscoe, Bernadette M",
+  journal   = "IEEE Ann. Hist. Comput.",
+  publisher = "IEEE",
+  volume    = 42,
+  number    = 2,
+  pages     = "42--54",
+  abstract  = "The Flexible Image Transport System (FITS) file format has
+               become the de facto standard for sharing, analyzing, and
+               archiving astronomical data over the last four decades. FITS was
+               adopted by astronomers in the early 1980s to overcome
+               incompatibilities between operating systems. On the back of
+               FITS’ success, astronomical data became both backward compatible
+               and easily shareable. However, new advances in the astronomical
+               instrumentation, computational technologies, and analytic
+               techniques have resulted in new data that do not work well
+               within the traditional FITS format. Tensions have arisen between
+               the desire to update the format to meet new analytic challenges
+               and adherence to the original edict for the FITS file format to
+               be backward compatible. We examine three inflection points in
+               the governance of FITS: first, initial development and success,
+               second, widespread acceptance and governance by the working
+               group, and third, the challenges to FITS in a new era of
+               increasing data and computational complexity within astronomy.",
+  year      = 2020
+}
+
 @ARTICLE{Musen2022metadata,
   title = "Without appropriate metadata, data-sharing mandates are
diff --git a/sections/02-use-cases.qmd b/sections/02-use-cases.qmd
index e1213a4..d699199 100644
--- a/sections/02-use-cases.qmd
+++ b/sections/02-use-cases.qmd
@@ -20,17 +20,34 @@ Image Transport System) file format standard, which was developed in the late
 astronomy data preservation and exchange. Essentially every software platform
 used in astronomy reads and writes the FITS format. It was developed by
 observatories in the 1980s to store image data in the visible and x-ray
-spectrum. It has been endorsed by IAU, as well as funding agencies. Though the
-format has evolved over time, “once FITS, always FITS”. That is, the format
-cannot be evolved to introduce changes that break backward compatibility.
-Among the features that make FITS so durable is that it was designed originally
-to have a very restricted metadata schema. That is, FITS records were designed
-to be the lowest common denominator of word lengths in computer systems at the
-time. However, while FITS is compact, its ability to encode the coordinate
-frame and pixels, means that data from different observational instruments can
-be stored in this format and relationships between data from different
-instruments can be related, rendering manual and error-prone procedures for
-conforming images obsolete.
+spectrum. It has been endorsed by the International Astronomical Union (IAU),
+as well as funding agencies. Though the format has evolved over time, “once
+FITS, always FITS”. That is, the format cannot be evolved to introduce changes
+that break backward compatibility. Among the features that make FITS so durable
+is that it was designed originally to have a very restricted metadata schema.
+That is, FITS records were designed to be the lowest common denominator of word
+lengths in computer systems at the time. However, while FITS is compact, its
+ability to encode the coordinate frame and pixels means that data from
+different observational instruments can be stored in this format and related to
+one another, rendering manual and error-prone procedures for conforming images
+obsolete. Nevertheless, this stability has also raised issues as the field
+continues to adapt to new measurement methods and to the demands of
+ever-increasing data volumes and complex data analysis use-cases, such as
+interchange with other data and the use of complex databases to store and share
+data [@Scroggins2020-ut]. Another prominent example of the use of open-source
+processes to develop standards in astronomy is in the tools and protocols
+developed by the International Virtual Observatory Alliance (IVOA) and its
+national implementations, e.g., in the US Virtual Astronomical Observatory
+[@Hanisch2015-cu]. The virtual observatories facilitate discovery and access
+across observatories around the world and underpin data discovery in astronomy.
+The IVOA took inspiration from the World Wide Web Consortium (W3C) and adopted
+its process for the development of its standards (i.e., Working Drafts
+$\rightarrow$ Proposed Recommendations $\rightarrow$ Recommendations), with
+individual standards developed by inter-institutional and international working
+groups. One outcome of this coordination effort is an ecosystem of software
+tools, developed both within observatory teams and within the user community,
+that interoperate with the standards adopted by the observatories.
 
 ## High-energy physics (HEP)
 
@@ -47,13 +64,38 @@ data is shared (i.e., in a standards-compliant manner).
 
 ## Earth sciences
 
-The need for geospatial data exchange between different systems began to be recognized in the 1970s and 1980s, but proprietary formats still dominated. Coordinated standardization efforts brought the Open Geospatial Consortium (OGC) establishment in the 1990s, a critical step towards open standards for geospatial data. The 1990s have also seen the development of key standards such as the Network Common Data Form (NetCDF) developed by the University Corporation for Atmospheric Research (UCAR) and the Hierarchical Data Format (HDF), a set of file formats (HDF4, HDF5) that are widely used, particularly in climate research. The GeoTIFF format, which originated at NASA in the late 1990s, is extensively used to share image data. In the 1990s, open web mapping also began with MapServer (https://mapserver.org) and continued later with other projects such as OpenStreetMap (www.openstreetmap.org). The following two decades, the 2000s-2020s, brought an expansion of open standards and integration with web technologies developed by OGC, as well as other standards such as the Keyhole Markup Language (KML) for displaying geographic data in Earth browsers. Formats suitable for cloud computing also emerged, such as the Cloud Optimized GeoTIFF (COG), followed by Zarr and Apache Parquet for array and tabular data, respectively. In 2006, the Open Source Geospatial Foundation (OSGeo, https://www.osgeo.org) was established, demonstrating the community's commitment to the development of open-source geospatial technologies. While some standards have been developed in the industry (e.g., Keyhole Markup Language (KML) by Keyhole Inc., which Google later acquired), they later became international standards of the OGC, which now encompasses more than 450 commercial, governmental, nonprofit, and research organizations working together on the development and implementation of open standards (https://www.ogc.org).
+The need for geospatial data exchange between different systems began to be
+recognized in the 1970s and 1980s, but proprietary formats still dominated.
+Coordinated standardization efforts brought the establishment of the Open
+Geospatial Consortium (OGC) in the 1990s, a critical step towards open
+standards for geospatial data. The 1990s also saw the development of key
+standards such as the Network Common Data Form (NetCDF), developed by the
+University Corporation for Atmospheric Research (UCAR), and the Hierarchical
+Data Format (HDF), a set of file formats (HDF4, HDF5) that are widely used,
+particularly in climate research. The GeoTIFF format, which originated at NASA
+in the late 1990s, is extensively used to share image data. In the 1990s, open
+web mapping also began with MapServer (https://mapserver.org) and continued
+later with other projects such as OpenStreetMap
+(https://www.openstreetmap.org). The following two decades, the 2000s-2020s,
+brought an expansion of open standards and integration with web technologies
+developed by OGC, as well as other standards such as the Keyhole Markup
+Language (KML) for displaying geographic data in Earth browsers. Formats
+suitable for cloud computing also emerged, such as the Cloud Optimized GeoTIFF
+(COG), followed by Zarr and Apache Parquet for array and tabular data,
+respectively. In 2006, the Open Source Geospatial Foundation (OSGeo,
+https://www.osgeo.org) was established, demonstrating the community's
+commitment to the development of open-source geospatial technologies. While
+some standards were developed in industry (e.g., KML, created by Keyhole Inc.,
+which Google later acquired), they later became international standards of the
+OGC, which now encompasses more than 450 commercial, governmental, nonprofit,
+and research organizations working together on the development and
+implementation of open standards (https://www.ogc.org).
 
 ## Neuroscience
 
-In contrast to astronomy and HEP, Neuroscience has traditionally been a
-"cottage industry", where individual labs have generated experimental data
-designed to answer specific experimental questions. While this model still
+In contrast to the previously mentioned fields, neuroscience has traditionally
+been a "cottage industry", where individual labs have generated experimental
+data designed to answer specific experimental questions. While this model still
 exists, the field has also seen the emergence of new modes of data production
 that focus on generating large shared datasets designed to answer many
 different questions, more akin to the data generated in large astronomy data
@@ -72,7 +114,7 @@ success to the adoption of OSS development mechanisms [@Poldrack2024BIDS]. For
 example, small changes to the standard are managed through the GitHub pull
 request mechanism; larger changes are managed through a BIDS Enhancement
 Proposal (BEP) process that is directly inspired by the Python programming
-language community's Python Enhancement Proposal procedure, which isused to
+language community's Python Enhancement Proposal procedure, which is used to
 introduce new ideas into the language. Though the BEP mechanism takes a
 slightly different technical approach, it tries to emulate the open-ended and
 community-driven aspects of Python development to accept contributions from a
@@ -102,3 +144,4 @@ if the standard is developed using git/GitHub for versioning, this would
 require learning the complex and obscure technical aspects of these system
 that are far from easy to adopt, even for many professional scientists.
 
+
diff --git a/sections/03-challenges.qmd b/sections/03-challenges.qmd
index f85ca86..8ddaa32 100644
--- a/sections/03-challenges.qmd
+++ b/sections/03-challenges.qmd
@@ -31,6 +31,12 @@ community, and migration away from the standard. Similarly, if a standard
 evolves too rapidly, users may choose to stick to an outdated version of a
 standard for a long time, creating strains on the community of developers and
 maintainers of a standard who will need to accommodate long deprecation cycles.
+On the other hand, in cases in which some forms of dynamic change are
+prohibited -- as with the FITS file format, which forbids changes that break
+backwards compatibility -- there is also a cost associated with this stability
+[@Scroggins2020-ut]: it limits the adoption and combination of new types of
+measurements, new analysis methods, and new modes of data storage and data
+sharing.
 
 ## Mismatches between standards developers and user communities
 
@@ -56,6 +62,18 @@ have not yet had significant adoption as tools of day-to-day computational
 practice. At the same time, it provides clarity and robustness for standards
 developers communities that are well-versed in these tools.
 
+Another layer of potential mismatches arises when a more complex set of
+stakeholders needs to be considered. For example, the Group on Earth
+Observations (GEO) is a network that aims to coordinate decision making around
+satellite missions and to standardize the data that results from these
+missions. Because this group involves a range of different stakeholders --
+including individuals who more closely understand potential legal issues and
+researchers who are better equipped to evaluate technical and domain questions
+-- communication within the group can be slow and difficult. As the group aims
+to move forward by consensus, these communication difficulties are amplified.
+This is just one example of the many cases in which an OSS process that strives
+for consensus can slow progress.
+
 ## Cross-domain gaps
 
@@ -146,6 +164,5 @@ grants (and see @sec-cross-sector).
This hampers the long-term trajectory
 that is needed to inculcate a standard into the day-to-day practice of
 researchers.
 
-## The importance of automated validation
 
diff --git a/sections/04-cross-sector.qmd b/sections/04-cross-sector.qmd
index 46122e5..3dc61a7 100644
--- a/sections/04-cross-sector.qmd
+++ b/sections/04-cross-sector.qmd
@@ -91,9 +91,24 @@ provide specific sources of friction. This is because proprietary/closed
 formats of data can create difficulty at various transition points: from one
 instrument vendor to another, from data producer to downstream recipient/user,
 etc. On the other hand, in some cases, cross-sector collaborations with
-commercial entities may pave the way to robust and useful standards. One
-example is the DICOM standard, which is maintained by working groups that
-encompass commercial imaging device vendors and researchers.
+commercial entities may pave the way to robust and useful standards. For
+example, imaging measurements in human subjects (e.g., in brain imaging
+experiments) interact significantly with standards for medical imaging, chiefly
+the Digital Imaging and Communications in Medicine (DICOM) standard, which is
+widely used in a range of medical imaging applications, including in clinical
+settings [@Larobina2023-vq; @Mustra2008-xk]. The standard emerged from the
+demands of clinical practice in the 1980s, as digital technologies came into
+widespread use in medical imaging, through the joint work of two organizations:
+the American College of Radiology and the National Electrical Manufacturers
+Association. One of the defining features of the DICOM standard is that it
+allows manufacturers of instruments to define "private fields" that are
+compliant with the standard, but which may include idiosyncratically organized
+data and/or metadata. This provides significant flexibility, but can also
+easily lead to the loss of important information. Nevertheless, human brain
+imaging exemplifies a case in which industry standards and research standards
+coexist and need to communicate with each other effectively to advance research
+use-cases, while keeping up with the rapid development of these technologies.
 
diff --git a/sections/05-recommendations.qmd b/sections/05-recommendations.qmd
index 009a5ba..75b316c 100644
--- a/sections/05-recommendations.qmd
+++ b/sections/05-recommendations.qmd
@@ -1,11 +1,87 @@
 # Recommendations for open-source data and metadata standards {#sec-recommendations}
 
-In conclusion of this report, we propose the following recommendations:
-
-## Policy-making and Funding entities:
-
-### Fund Data Standards Development
+To conclude this report, we would like to propose a set of recommendations that
+distill the lessons learned from an examination of data and metadata standards
+through the lens of open-source software development practices. We divide this
+section into two parts: one aimed at the science and technology communities
+that develop and maintain open-source standards, and the other aimed at
+policy-making and funding agencies, who have an interest in fostering more
+efficient, more robust, and more transparent open-source standards.
+
+## Science and technology communities:
+
+### Establish standards governance based on OSS best practices
+
+While best-practice governance principles are themselves relatively new in OSS
+communities, there is already a substantial body of prior art in this domain,
+on which the developers and maintainers of open-source data and metadata
+standards can rely. For example, it is now clear that governance principles and
+rules can mitigate some of the risks and challenges mentioned in
+@sec-challenges, especially for communities beyond a certain size that need to
+converge toward a new standard or rely on an existing standard. Developers and
+maintainers should review existing governance practices such as [The Open Source
+Way](https://www.theopensourceway.org/the_open_source_way-guidebook-2.0.html#_project_and_community_governance).
+
+### Foster meta-standards development
+
+One of the main conclusions that arises from our survey of the landscape of
+existing standards is that significant knowledge exists across fields and
+domains that informs the development of standards within each field, but that
+this knowledge could be surfaced to a level where it may be adopted more widely
+in different domains and be more broadly useful. One way to achieve this is a
+comparative approach, in which a readiness and/or maturity model is developed
+that assesses the challenges and opportunities that a specific standard faces
+at its current phase of development. Developing such a maturity model, while it
+goes beyond the scope of the current report, could lead to the eventual
+development of a meta-standard or standard-of-standards: a succinct description
+of cross-cutting best practices that can be used as a basis for the analysis or
+assessment of an existing standard, or as guidelines to develop new standards.
+For instance, such guidelines should consider specific barriers to adopting a
+data standard, taking into account the size of the community and its specific
+technological capabilities.
+
+More generally, meta-standards could include formalization for versioning of
+standards and interactions with specific related software. This includes
+amplifying formalization/guidelines on how to create standards (for example,
+metadata schema specifications using LinkML (https://linkml.io); a minimal
+sketch of such a schema appears at the end of this subsection). At the same
+time, aspects of communication with potential user audiences (e.g., researchers
+in particular domains) should be taken into account as well -- for example, the
+quality of onboarding documentation and of tools for ingestion or conversion
+into standards-compliant datasets.
+
+An ontology for the standards-development process -- for example, top-down vs
+bottom-up development, the minimum number of datasets, the target community
+size and the technical expertise typical of that community, and so forth --
+could help guide the standards-development process towards more effective
+adoption and use. A set of meta-standards and high-level descriptions of the
+standards-development process -- some of which are laid out in this report --
+could help standards developers avoid known pitfalls, such as the dreaded
+proliferation of standards or complexity-impeded adoption. Surveying and
+documenting the successes and failures of current standards for a specific
+dataset or domain can help disseminate knowledge about the standardization
+process. Resources such as [FAIRsharing](https://fairsharing.org/) or the
+[Digital Curation Centre](https://www.dcc.ac.uk/guidance/standards) can help
+guide this process.
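+
+To make the notion of a machine-readable schema concrete, the following is a
+minimal, hypothetical sketch of a LinkML-style metadata schema, embedded in
+Python so that it can be inspected programmatically. The class and attribute
+names are illustrative assumptions, not drawn from any existing standard:
+
+```python
+# A minimal, hypothetical LinkML-style schema for dataset metadata.
+# See https://linkml.io for the full schema language; the names below
+# are illustrative assumptions, not part of any published standard.
+import yaml  # requires the pyyaml package
+
+SCHEMA_YAML = """
+id: https://example.org/schemas/dataset
+name: dataset-metadata
+prefixes:
+  linkml: https://w3id.org/linkml/
+imports:
+  - linkml:types
+default_range: string
+
+classes:
+  Dataset:
+    description: Minimal metadata describing a shared dataset.
+    attributes:
+      identifier:
+        identifier: true
+      title:
+        required: true
+      license:
+        required: true
+      created:
+        range: datetime
+"""
+
+# Because the schema itself is data, tooling can be derived from it; here
+# we simply recover the fields that a validator would treat as mandatory.
+schema = yaml.safe_load(SCHEMA_YAML)
+attributes = schema["classes"]["Dataset"]["attributes"]
+required = [name for name, spec in attributes.items()
+            if spec.get("required") or spec.get("identifier")]
+print(required)  # ['identifier', 'title', 'license']
+```
+
+From a single source of this kind, LinkML tooling can generate JSON Schema,
+documentation, and language bindings, which is one concrete sense in which a
+standard can be made machine-readable.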
+
+### Develop standards in tandem with standards-associated software
+
+Development of standards should be coupled and tightly linked with the
+development of associated software. This produces a virtuous cycle, where the
+use-cases and technical issues that arise in software development inform the
+development of the standard and vice versa. One of the lessons learned across a
+variety of different standards is the importance of automated validation of the
+standard. Automated validation is broadly seen as a requirement for the
+adoption of a standard and as a factor in managing change of the standard over
+time. To advance this virtuous cycle, we recommend making data standards
+machine-readable and making software creation an integral part of establishing
+a standard's schema. Additionally, standards evolution should maintain software
+compatibility, as well as the ability to translate and migrate between
+standards.
+
+
+## Policy-making and funding entities:
+
+### Fund the development of open-source standards
 
 While some funding agencies already support standards development as part of
 the development of informatics infrastructures, data standards development
@@ -16,13 +92,18 @@ community efforts and tools for this. The OSS model is seen as a particularly
 promising avenue for an investment of resources, because it builds on
 previously-developed procedures and technical infrastructure and because it
 provides avenues for the democratization of development processes and for
-community input along the way. The clarity offered by procedures for
-enhancement proposals and semantic versioning schemes adopted in standards
-development offers avenues for a range of stakeholders to propose well-defined
+community input along the way. At the same time, there are significant
+challenges associated with incentives to engage, ranging from the dilution of
+credit for individual contributors to the burnout of maintainers and
+developers. The clarity offered by procedures for enhancement proposals and
+semantic versioning schemes adopted in standards development offers avenues for
+a range of stakeholders to propose well-defined
 contributions to large and field-wide standards efforts (e.g.,
-[@pestilli2021community]).
+[@pestilli2021community]), and potentially helps address some of these concerns
+by providing avenues for individual contributions to surface, as well as
+clarity of process, which can reduce the risk of maintainer burnout.
 
-### Invest in Data Stewards
+### Invest in data stewards
 
 Advancing the development and adoption of open-source standards requires the
 dissemination of knowledge to researchers in a variety of fields, but this
@@ -43,9 +124,14 @@ methodology of OSS. This does not mean that they need to become software
 engineers -- though for some of them there may be some overlap with the role of
 research software engineers [@Connolly2023Software] -- but rather that they
 need to become familiar with those parts of the OSS development life-cycle that
-are specifically useful for the development of open-source standards. For example, tools for version control, tools for versioning, and tools for creation and validation of compliant data and metadata.
+are specifically useful for the development of open-source standards. For
+example, tools for version control, tools for versioning of the standard
+itself, and tools for creation and validation of compliant data and metadata (a
+minimal sketch of such a validation tool follows below).
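+
+Here is a hedged sketch of such a validation tool, using JSON Schema (via the
+`jsonschema` Python package) as a stand-in for whatever schema language a given
+standard adopts; the schema, the field names, and the record are hypothetical:
+
+```python
+# A sketch of automated validation against a machine-readable standard.
+# The schema and record are illustrative assumptions, not a real standard.
+from jsonschema import Draft202012Validator
+
+DATASET_SCHEMA = {
+    "type": "object",
+    "required": ["identifier", "title", "license"],
+    "properties": {
+        "identifier": {"type": "string", "pattern": r"^10\.\d{4,}/"},  # DOI-like
+        "title": {"type": "string", "minLength": 1},
+        "license": {"type": "string"},
+        "created": {"type": "string"},
+    },
+}
+
+record = {"identifier": "10.5281/zenodo.0000000", "title": "Example dataset"}
+
+# Report every violation rather than stopping at the first one, so that
+# data producers can fix a whole file in a single pass.
+validator = Draft202012Validator(DATASET_SCHEMA)
+for error in validator.iter_errors(record):
+    print(f"non-compliant: {error.message}")
+# prints: non-compliant: 'license' is a required property
+```
+
+Real validators layer domain-specific logic on top of this kind of schema
+check, but the principle is the same: the standard is expressed as data, and
+compliance is checked by software.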
-### Review Data Standards Pathways
+Stakeholder organizations should invest in training grants to establish a
+curriculum for data and metadata standards education.
+
+### Review open-source standards pathways
 
 Invest in programs that examine retrospective pathways for establishing data
 standards. Encourage publication of lifecycles for successful data standards.
@@ -57,73 +143,16 @@ step of the work of standards creators and granting agencies. In the meanwhile,
 it would be good to also retroactively document the lifecycle of existing
 standards that are seen as success stories. Research on the principles that
 underlie successful open-source standards development can be used to formulate
-new standards and iterate on existing ones.
-
-### Establish Governance
+new standards and iterate on existing ones. Data management plans should
+promote the sharing of not only data, but also metadata and descriptions of how
+to use it.
 
-Establish governance for standards creation and adoption, especially for
-communities beyond a certain size that need to converge toward a new standard
-or rely on an existing standard. Review existing governance practices such as
-[TheOpenSourceWay](https://www.theopensourceway.org/the_open_source_way-guidebook-2.0.html#_project_and_community_governance).
-Data management plans should promote the sharing of not only data, but also
-metadata and descriptions of how to use it.
-
-
-### Program Manage Cross Sector alliances
+### Manage cross-sector alliances
 
 Encourage cross-sector and cross-domain alliances that can impact successful
 standards creation. Invest in robust program management of these alliances to
-align pace and create incentives (for instance via Open Source Program Office /
-OSPO efforts). Similar to program officers at funding agencies, standards
-evolution need sustained PM efforts. Multi company partnerships should include
-strategic initiatives for standard establishment e.g.
-[Pistoiaalliance](https://www.pistoiaalliance.org/news/press-release-pistoia-alliance-launches-idmp-1-0/).
-
-
-### Curriculum Development
-
-Stakeholder organizations should invest in training grants to establish curriculum for data and metadata standards education.
-
-## Science and Technology Communities:
-
-### User-Driven Development
-
-Standards should be needs-driven and developed in close collaboration with users. Changes and enhancements should be in response to community feedback.
-
-### Meta-Standards development
-
-In surveying the landscape of existing standards, a readiness/maturity model
-can be developed that assesses the challenges and opportunities that a specific
-standard faces. This process in itself can be standardized to develop
-meta-standards or standards-of-standards. These are the succinct descriptions
-of cross-cutting best-practices that can be used as a basis for the analysis or
-assessment of an existing standard, or as guidelines to develop new standards.
-For instance, barriers to adopting a data standard irrespective of team size
-and technological capabilities should be considered. Meta-standards should
-include formalization for versioning of standards and interactions with
-specific related software. Aspects of communication with potential user
-audiences (e.g., researchers in particular domains) should be taken into
-account as well. For example, in the quality of onboarding documentation and
-tools for ingestion or conversion into standards-compliant datasets.
Relatedly,
-it would be good to create an ontology for standards process such as top down
-vs bottom up, minimum number of datasets, target community size and technical
-expertise typical of this community, etc. This ontology can help guide the
-standards-development process towards more effective adoption and use.
-
-
-### Formalization Guidelines
-
-Amplify formalization/guidelines on how to create standards (example metadata schema specifications using [LinkML](https://linkml.io).
-
-### Landscape and Failure Analysis
-
-Before establishing a new standard, survey and document failure of current standards for a specific dataset / domain. Use resources such as [Fairsharing](https://fairsharing.org/) or [Digital Curation Center](https://www.dcc.ac.uk/guidance/standards).
-
-### Machine Readability
-
-Development of standards should be coupled with development of associated software. Make data standards machine readable, and software creation an integral part of establishing a standard's schema e.g. For identifiers for a person using CFF in citations, cffconvert software makes the CFF standard usable and useful.
-Additionally, standards evolution should maintain software compatibility, and ability to translate and migrate between standards.
-
-
-
-
+align pace and create incentives (for instance via Open Source Program Offices
+at universities or other research organizations). Similar to program officers
+at funding agencies, standards evolution needs sustained program management
+efforts. Multi-company partnerships, such as the Pistoia Alliance
+(https://www.pistoiaalliance.org/), should include strategic initiatives for
+standards establishment.