diff --git a/bibliography.bib b/bibliography.bib index 5bee2812e9d627a0c8ff490de761ed7464851065..20e00da0552a06b166d07196cbe50d02c3d52cb3 100644 --- a/bibliography.bib +++ b/bibliography.bib @@ -1,10 +1,143 @@ -@misc{djangoproject_models_2016, - title = {Models and Databases | {{Django}} Documentation | {{Django}}}, - timestamp = {2016-12-19T03:31:30Z}, - urldate = {2016-12-19}, - howpublished = {\url{https://docs.djangoproject.com/en/1.10/topics/db/}}, - author = {{djangoproject}}, - month = dec, - year = {2016} + +@book{groupRDFaCoreRecommendation2012, + title = {{{RDFa Core}} 1.1 {{Recommendation}}}, + author = {{W3C Working Group} and others}, + year = {2012}, + url = {http://www.w3.org/TR/rdfa-syntax/} +} + +@misc{kunzeBagItFilePackaging, + title = {The {{BagIt File Packaging Format}} ({{V0}}.97)}, + urldate = {2013-03-01}, + howpublished = {http://tools.ietf.org/html/draft-kunze-bagit-06}, + author = {Kunze, John and Boyko, Andy and Vargas, Brian and Madden, Liz and Littman, Justin}, + file = {/Users/124411/Zotero/storage/G69D6B5W/draft-kunze-bagit-06.html}, + note = {(accessed 2013-03-01T04:15:25Z)}, + url = {http://tools.ietf.org/html/draft-kunze-bagit-06} +} + +@inproceedings{seftonDataCrateFormalisingWays2017, + address = {Brisbane}, + title = {{{DataCrate}}: {{Formalising}} Ways of Packaging Research Data for Re-Use and Dissemination}, + abstract = {In 2013 Peter Sefton and Peter Bugeia presented at eResearch Australasia on a format for packaging research data(1), using standards based metadata, with one innovative feature \textendash{} instead of including metadata in a machine readable format only, each data package came with an HTML file that contained both human and machine readable metadata, via RDFa, which allows semantic assertions to be embedded in a web page. 
+ +Variations of this technique have been included in various software products over the last few years, but the there was no agreed standard on which vocabularies to use for metadata, or specification of how the files fitted together. + + +THE PRESENTATION + +This presentation will describe work in progress on the DataCrate specification(2), illustrated with examples, including a tool to create DataCrate. We will also discuss other work in this area, including Research Object Bundles (3) and DataConservency(4) packaging. + +We will be seeking feedback from the community on this work should it continue? Is it useful? Who can help out? + +The DataCrate spec: + + Has both human and machine readable metadata at a package (data set/collection) level as well as at a file level + Allows for and encourages inclusion of contextual metadata such as descriptions of organisations, facilities, experiments and people linked to files with meaningful relationships (eg to say a file was created by a particular machine, as part of a particular experiment, at an organisation). + Is a BagIt profile(5). BagIt(6) is a simple packaging standard for file-based data. 
+ Has a README.html tag file at the root with bagit-style metadata about the distribution (contact details etc) with a link to; + a CATALOG.html file in RDFa, using schema.org metadata inside the payload (data) dir with detailed information about the files in the package, and a redundant CATALOG.json in JSON-LD format + Is extensible easily as it is based on RDF.}, + booktitle = {{{eResearch Australasia}} 2017}, + author = {Sefton, Peter}, + year = {2017}, + url = {http://ptsefton.com/2017/10/19/datacrate.htm} +} + +@techreport{ruestBagItProfilesSpecification2017, + title = {{{BagIt Profiles Specification}}}, + author = {Ruest, Nick}, + month = jun, + year = {2017}, + file = {/Users/124411/Zotero/storage/NTNVAK4H/bagit-profiles.html}, + url = {https://github.com/ruebot/bagit-profiles} +} + +@misc{ResearchObjectBundle, + title = {Research {{Object Bundle}}}, + urldate = {2017-06-16}, + howpublished = {https://researchobject.github.io/specifications/bundle/}, + file = {/Users/124411/Zotero/storage/N5C3F2SW/bundle.html}, + note = {(accessed 2017-06-16T01:16:23Z)}, + url = {https://researchobject.github.io/specifications/bundle/} +} + +@misc{DatagovukDcatusage, + title = {Datagovuk/Dcat-Usage}, + abstract = {dcat-usage - Discussion of how to use DCAT with examples}, + urldate = {2017-03-28}, + howpublished = {https://github.com/datagovuk/dcat-usage}, + journal = {GitHub}, + file = {/Users/124411/Zotero/storage/HQSJCWWS/DCAT-Examples.html}, + note = {(accessed 2017-03-28T20:33:26Z)}, + url = {https://github.com/datagovuk/dcat-usage} +} + +@inproceedings{seftonIntroducingNextYear2013, + address = {Brisbane, Australia}, + title = {Introducing next Year's Model, the Data-Crate; Applied Standards for Data-Set Packaging}, + abstract = {In this paper we look at existing generic options available for storing research data in a way that encourages reuse and discoverability, both at the level of data files, and sets of data files, and describe some original work bringing 
together existing standards and metadata schemas to make well-described, reusable data sets that can be distributed as single files ('crates'). We focus on both formats for individual files and packaging formats, looking at how to best represent research data so that it can be disseminated and archived with as much context and provenance as possible. We look at some of the issues in choosing a file format in which to archive and disseminate data, and discuss techniques for adding contextual information which is both human-readable and machine-readable.}, + booktitle = {{{eResearch Australasia}} 2013}, + author = {Sefton, Peter and Bugeia, Peter}, + year = {2013}, + url = {http://eresearchau.files.wordpress.com/2013/08/eresau2013_submission_57.pdf} +} + +@inproceedings{seftonPickPackagePublish2014, + address = {Melbourne}, + title = {Pick, {{Package}} and {{Publish}} Research Data: {{Cr8it}} and {{Of The Web}}}, + abstract = {This presentation is about two complementary open source software products that have been +produced by a consortium of partners including the University of Western Sydney, the University +of Newcastle and Intersect Australia. These products are designed to bridge the gap between +easily$\-$accessible dropbox.com$\-$style working$\-$data file$\-$sharing and synchronization and the +publishing and archiving of mature research data sets. Cr8it (crate$\-$it) is a file packaging and +publishing application that lets users package data together with metadata in order to maximise +its future potential for reuse using Data Crates $\-$ the crate format is designed to include as much +metadata as possible to maximise data usefulness, plus the actual data payload. Cr8it is a +plugin for the open source file$\-$synchronization and sharing platform ownCloud, becoming +familiar to Australian researchers from the AARNet Cloudstor+ product. 
Of The Web (OTW) is a +toolkit for extracting metadata from generic and domain$\-$specific file formats, and creating +web$\-$previews, for example extracting time series data from proprietary formats, and creating +summary web$\-$pages with. Both products are built on a wide range of open source componentry.}, + booktitle = {{{eResearch Australasia}} 2014}, + author = {Sefton, Peter and Bugeia, Peter and Picasso, Vicki}, + month = oct, + year = {2014}, + url = {http://eresearchau.files.wordpress.com/2014/07/eresau2014_submission_30.pdf} } +@article{RDFa2014, + title = {{{RDFa}}}, + copyright = {Creative Commons Attribution-ShareAlike License}, + abstract = {RDFa (or Resource Description Framework in Attributes[1]) is a W3C Recommendation that adds a set of attribute-level extensions to HTML, XHTML and various XML-based document types for embedding rich metadata within Web documents. The RDF data-model mapping enables its use for embedding RDF subject-predicate-object expressions within XHTML documents. 
It also enables the extraction of RDF model triples by compliant user agents.}, + language = {en}, + urldate = {2014-02-26}, + journal = {Wikipedia, the free encyclopedia}, + month = feb, + year = {2014}, + file = {/Users/124411/Zotero/storage/B3QR5GD7/2014 - RDFa.html}, + note = {(accessed 2014-02-26T21:05:23Z)}, + url = {http://en.wikipedia.org/w/index.php?title=RDFa\&oldid=592600280} +} + +@inproceedings{wheelerEndtoEndResearchData2018, + address = {Taipei}, + title = {End-to-{{End Research Data Management}} for the {{Responsible Conduct}} of {{Research}} at the {{University}} of {{Technology Sydney}}}, + language = {en}, + urldate = {2018-07-10}, + publisher = {{APRI}}, + author = {Wheeler, Louise and Wise, Sharyn and Sefton, Peter}, + month = jul, + year = {2018}, + note = {(accessed 2018-07-10T22:37:36Z)}, + url = {https://eresearch.uts.edu.au/2018/07/04/APRI_2018_provisioner.htm} +} + +@article{soiland-reyesResearchObjectBundle2014, + title = {Research Object Bundle 1.0}, + journal = {Specification, researchobject.org}, + author = {{Soiland-Reyes}, Stian and Gamble, Matthew and Haines, Robert}, + year = {2014} +} + + diff --git a/bibliography.csl b/bibliography.csl index 9d967b0bf10d739b0ddc11e423f7559dbfe531e1..2460b9c44dadfcb58854c31e8cd9afaa3fe9e8df 100644 --- a/bibliography.csl +++ b/bibliography.csl @@ -167,8 +167,7 @@ </macro> <macro name="access"> <choose> - <if type="webpage"> - <choose> + <if variable="URL"> <group delimiter=". 
"> <text term="online" prefix="[" suffix="]" text-case="capitalize-first"/> @@ -187,8 +186,7 @@ </group> </if> </choose> - </if> - </choose> + </macro> <macro name="page"> <group> @@ -274,7 +272,7 @@ <text variable="container-title" font-style="italic"/> <text macro="issued"/> </group> - <text macro="access"/> + </else-if> <else-if type="patent"> <group delimiter=", "> @@ -334,6 +332,8 @@ </group> </else> </choose> + + <text macro="access"/> </layout> </bibliography> -</style> \ No newline at end of file +</style> diff --git a/build/paper.pdf b/build/paper.pdf index 6a5a3289edae57a14e8582cf71fb51d0af9ee3c9..706198ba9cfd31f3a1b2e999f3c9faa026696767 100644 Binary files a/build/paper.pdf and b/build/paper.pdf differ diff --git a/makefile b/makefile index 6359a42a52c5641329ec2a14fd90e333878bad17..55c1be655c3b069f58dff6dff121dc237fcfdea9 100644 --- a/makefile +++ b/makefile @@ -17,4 +17,3 @@ pdf: clean: rm build/* - diff --git a/metadata.yaml b/metadata.yaml index f8ddc05d79cddb765eeda46ecfd763bd6ea08c23..d45f8a0162f75768e5036a02a02d9be5f6cf208d 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -1,21 +1,20 @@ --- -title: 'Paper Title' +title: 'DataCrate - ' author: - - name: Santos Gallegos - affilation: Universidad de Cuenca - location: Cuenca, Ecuador - email: santos_g@email.com + - name: Peter Sefton + affilation: University of Technology Sydney + location: Sydney, Australia + email: peter.sefton@uts.edu.au keywords: - - one - - two - - three + - Research Objects + - Data Packaging + - Metadata numbersections: yes lang: en babel-lang: english abstract: | - This is the abstract. + DataCrate is a ... - It consists of two paragraphs. + And... ... - diff --git a/paper.md b/paper.md index 5b6fd86ab4f535e4b16639fa97f4e87326a08643..ad193ce59e7cbb38afefd8bf6ac8856dc3f4a3df 100644 --- a/paper.md +++ b/paper.md @@ -1,45 +1,157 @@ +# Note to reviewers + +The specification here is currently in draft at v0.2. A version one release is +planned for October 2018. 
If accepted this paper would be updated for +presentation and subsequent publication. + # Introduction -An introduction. Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed -do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim -veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo -consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse -cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non -proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +In illuminating the term *Research Object* the call for proposals for Research +Object 2018 uses the phrase "multi-part research outcomes with their context". +The DataCrate specification is a research data packaging and dissemination +specification designed to capture exactly this; outcomes (also inputs) and +context. It is designed to assemble files that represent arbitrary research +outcomes and inputs and context to make sense of them, which could be raw or +derived data or code with context, including but not limited to data about the +people, software and equipment used in the research as well as supporting +documents such as publications, funding agreements or README files. + +DataCrate specifies how to gather-together data in such a way that it can (a) be +packaged via Zip, tar, a disc image, a multi-part package like Rar TODO or (b) be +hosted on a web server or file share for inspection by potential users and/or +used directly on High Performance Computing systems or otherwise accessed and +analysed. + + +# Methodology + +The DataCrate specification grew out of two strands of work that implemented +very similar data packaging approaches, using a combination of standards, but +without a formal specification. 
The first implementation was in the [HIEv system +at Western Sydney University], a data capture system for environmental science +data that captures data files produced by sensor networks and allows manual or +API-based upload of other files. Using a web interface researchers can select +files to export, for example to support a research article. + +The requirements / principles for the HIEv system were presented at eResearch Australasia [@seftonIntroducingNextYear2013]. + +> 1. The packaging format should not be data-format-sensitive +> 2. The packaging format should not be research domain specific +> 3. The packaging format should not be technology or platform specific +> 4. The data package should contain as much contextual information as possible +> 5. Metadata should be easily human and machine-readable +> 6. The package format should contain self-checking and verification features +> 7. The metadata format should be compatible with the semantic web by using +> URIs as names for things including metadata terms* +> 8. `7` implies using Linked-Data, but the project should not attempt to +> define and manage its own ontologies, for reasons of sustainability* +> 9. A data package should be able to be displayed on the web - implying that +> the human readable metadata in `5` should be in HTML.* + +* The last three requirements were not explicit in the presentation but were discussed +in the team, and proved important in the development of DataCrate. + +The engineering solution was to use the BagIt [@kunzeBagItFilePackaging] +packaging spec to cover requirement `6` - BagIt also doesn't get in the way of +any of the other requirements. + +The main innovation in HIEv's packaging was to add a machine-generated HTML file +that covered both `4` (as much context as possible) & `5` (human and machine +readable metadata). 
To do this, HIEv produces a summary of the context, with +information about the facilities used -- their name, nature and location -- and +technical details about the payload files, thus satisfying `5`, using RDFa [@RDFa2014] to +embed metadata in the HTML file. + + +[Cr8it] [@seftonPickPackagePublish2014] was another implementation that used the +same basic idea for data packages but which was never standardized. + +The first two implementations had no guidelines for what metadata to +use beyond what was hard-wired into each code-base, so there was no hope of easy +interoperability or safe extensibility, and there were no repositories into +which data could be published, but feedback from the eResearch community and the +very limited number of researchers exposed to the systems was strong, so in 2016 +when UTS began work on a new Research Data Management service +[@wheelerEndtoEndResearchData2018], the DataCrate standard was born. -# Method +A team based at UTS, with some external collaborators, started +a process to work out (a) was there an existing standard which met the +requirements? (b) If no, which RDF vocabularies should we use? and (c) the +mechanics of organising the files in the packages. -Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod -tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At -vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, -no sea takimata sanctus est Lorem ipsum dolor sit amet. +### Existing standards -Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod -tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At -vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, -no sea takimata sanctus est Lorem ipsum dolor sit amet. 
+We were not able to find any general-purpose packaging specification with +anything like the HTML+RDFa index that HIEv data packages have, allowing for +human and machine readable metadata, but our approach was to choose a base +standard that covered the other requirements and add to it. -# Footnotes +BagIt, which had been used in HIEv and Cr8it, is an obvious standard on which to +base a packaging format - it is widely used in the research data community, +there is tooling available and it covers the integrity aspects of packaging +data. -Example of footnote^[A footnote example]. Lorem ipsum dolor sit amet, consectetur -adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna -aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi -ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in -voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint -occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim -id est laborum. +An alternative, Frictionless data packages [TODO ref], which use a simple JSON format as a +manifest, can cover the basic packaging feature of BagIt and they have checksum +features built in. In their favour, frictionless data packages have the ability +to describe the headers in tabular data files. However, they do not meet +requirement `7` of having linked-data metadata, so while the JSON metadata is +technically machine readable, it is not easy to relate to the semantic web as +it does not use linked-data standards, and the terms are defined locally to the +specification. It was also unclear how to extend the specification, contrasting +with linked-data approaches which *automatically* allow extension by the use of +URIs. -# Cites +The other main alternative was the Research Object Bundle specification +[@soiland-reyesResearchObjectBundle2014]. 
At the time we started the DataCrate +work the Research Object domain name had expired, and the project looked to be +inactive. It has been re-instated, but this experience highlighted the risk +around adopting niche standards - which was a contributing factor in our +decision to use schema.org as the basic metadata standard for DataCrate. This +surfaced another implicit requirement, `8` - DataCrates needed to be useful even +after the potential disappearance of the original team, and given our resourcing +the team should not be involved in defining new ontologies. + +Research Object Bundle uses the Universal Container Format, which does not have +integrity features such as checksums; although there has been [some work done on +aligning it with BagIt](https://github.com/ResearchObject/bagit-ro), this was not final (and still is not) when we were evaluating RO, and there is still a note about [incompatibilities](https://github.com/ResearchObject/bagit-ro#considerations). + +The initial version of DataCrate (v0.1) was developed to add a formal +specification of what the core metadata for a DataCrate should be. Version 0.1 persisted +with HTML+RDFa but this was cumbersome and was removed in favour of an approach +where the HTML page is generated from a JSON-LD page rather than the other way +around. + +After looking at a variety of standards, including Dublin Core [TODO REF] which +is very limited in coverage and DCAT [TODO REF] which is more complete for +describing data sets, but silent on the issue of describing files or other +contextual entities, using schema.org was judged by the team to be the best way to +meet our goal of describing data sets, context and files. We settled on an +approach of recommending some other ontologies to use where schema.org has gaps +in its coverage. Schema.org is the most widely used linked-data vocabulary in the world [TODO: find a reference]. + +PCDM for repository content. + +SPAR ontologies for scholarly communications. 
+ +TBA - what to do for scientific discipline metadata. + +# Implementation + +The specification. -Zotero + Better BibTex. All cites are on the file bibliography.bib. This is -a cite[@djangoproject_models_2016]. # Conclusion -Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod -tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At -vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, -no sea takimata sanctus est Lorem ipsum dolor sit amet. +Tool support for consuming JSON-LD is still limited in some respects. + + + + + # References + +[HIEv system at Western Sydney University]: https://www.westernsydney.edu.au/eresearch/home/projects/completed_projects/hiev +[Cr8it]: https://www.westernsydney.edu.au/eresearch/home/projects/cr8it