Commit 63b0b3c0 authored by PTSEFTON's avatar PTSEFTON
Browse files

Pairtree stuff working now - still need to test on single-file output

parent 7f420ea0
This diff is collapsed.
# CalcyteJS
This is a work-in-progress port of the Python-based [Calcyte tool](https://codeine.research.uts.edu.au/eresearch/calcyte).
## Status
This is Beta code.
## About
Calcyte is a toolkit to implement the [DataCrate] specification:
1. Managing metadata for collections of content via automatically generated
spreadsheets, to create CATALOG.json files
2. Generating HTML from DataCrate CATALOG.json files.
3. Packaging data in BagIt format, and optionally zipping it.
Calcyte targets the [Draft DataCrate Packaging format v0.3](https://github.com/UTS-eResearch/datacrate/blob/master/spec/0.3/data_crate_specification_v0.3.md).
## Installation
- Install [node.js](https://nodejs.org/en/)
- Install [BagIt](https://github.com/LibraryOfCongress/bagit-java), for example with Homebrew:
`brew install bagit`
- Install Siegfried using the [instructions](https://github.com/richardlehane/siegfried/wiki/Getting-started).
- Get the code:
git clone https://code.research.uts.edu.au/eresearch/CalcyteJS.git
- Link the binary for development use:
npm link
## Usage / instructions
Usage:
```
> ./calcyfy
Usage: calcyfy [options] <directories...>
To run calcyfy on a group of directories pass it a list of directories
Options:
-V, --version output the version number
-g, --generate-html Generate HTML from a "CATALOG.json" in a directory
-b, --bag [bag-dir] Create Bagit Bag(s) under [bag-dir]
-n, --no No Citation - only applies with --bag
-z, --zip Create a zipped version of the bag - only applies with --bag
-d, --depth Maximum depth to recurse into directories
-r, --recurse Recurse into directories - up to 10
-m, --multiple Output multiple files instead of a single CATALOG.html
-h, --help output usage information
```
To run Calcyte on a group of directories pass it a list of directories
One directory:
```
calcyfy test_data/Glop_Pot
```
This will create a CATALOG.json file and CATALOG.html file in test_data/Glop_Pot
All the sample directories:
```
calcyfy test_data/*
```
Calcyte will generate:
- a CATALOG\_$dir.xlsx file in each directory (this is for humans to fill in with
metadata about the data)
- An index.html file summarizing the data using metadata from CATALOG\_$dir.xlsx
- A CATALOG.json file containing JSON-LD metadata derived from the CATALOG\* files plus some basic file-format information.
See the examples in `test_data`.
TODO: Instructions for filling in the CATALOG files.
[datacrate]: https://github.com/UTS-eResearch/datacrate
......@@ -114,7 +114,7 @@ if (program.generateHtml) {
var index_maker = new Index();
index_maker.init(
catalog,
path.join(dir, html_file_name),
path.join(dir),
multiple,
path.join(__dirname, "defaults/catalog_template.html")
);
......@@ -193,7 +193,7 @@ if (program.generateHtml) {
var index_maker = new Index();
index_maker.init(
path.join(output_dir, catalog_file_name),
path.join(output_dir, html_file_name),
path.join(output_dir),
multiple,
path.join(__dirname, "defaults/catalog_template.html")
);
......
// Type definitions for calcyte 0.0.2
// Project: DataCrate
// Definitions by: Mike Lynch
/**
 * Ambient declaration for the Collection class.
 *
 * Most lookup tables are still typed as `any`; only the members listed here
 * are declared.
 */
declare class Collection {
  constructor();

  children: Item[]; // check
  rel_path: string;
  items: Item[];
  name_lookup: any;
  id_lookup: any;
  json_ld: Object;
  field_names_by_type: any;
  existing_catalogs: any;
  root_node: any;

  /** Returns a catalog file name for `dir`, given the names already in use. */
  get_unique_catalog_name(dir: string, existing_catalogs?: string[]): string;
  index_graph(): void;
  generate_bag_info(): void;
  save_bag_info(): void;
  update(): void;
  to_html(): void;
  to_json(graph: Object): void;
  // Declared as a method: the former `bag:(bag_dir: string):void;` was not a
  // valid class member and stopped the declaration file from parsing.
  bag(bag_dir: string): void;
  to_json_ld(): Promise<void>;
  // NOTE(review): the test suite calls read(dir, "./", false, 1000), i.e. with
  // `false` as parent and a fourth numeric argument - confirm the real
  // signature against lib/collection.js before widening this declaration.
  read(dir: string, rel_path: string, parent: Collection): void;
}
// Package entry point: maps each public name to the module under ./lib that
// provides it. Every module is loaded eagerly when this file is required.
module.exports = {
Collection: require('./lib/collection.js'),
Index: require('./lib/index_html.js'),
Datacite: require('./lib/datacite.js'),
Bag: require('./lib/bag.js')
}
......@@ -17,7 +17,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
/* Defaults for Calcyte such as names of key files */
const html_multi_file_dirs = ["CATALOG"];
const html_multi_file_dirs = "CATALOG_files";
const catalog_root_name = "CATALOG";
const datacite_file_name = "datacite.xml";
const catalog_json_file_name = `${catalog_root_name}.json`;
......@@ -27,7 +27,7 @@ const max_files_in_dir = "100"; // Don't list files in a directory if there are
// Matches names to skip when listing files: the multi-file HTML output
// directory, names starting with "~" or ".", and anything containing the
// DataCite file name.
// html_multi_file_dirs may be a plain string (e.g. "CATALOG_files") or a
// one-element array; indexing a string with [0] would yield only its first
// character, so normalise to an array before taking the first entry.
const ignore_file_regex = new RegExp(
  `(^${[].concat(html_multi_file_dirs)[0]}$)|(^~)|(^\\.)|(${datacite_file_name})`
);
const ignore_dir_regex = new RegExp(`(^${html_multi_file_dirs[0]}$)|(^\\.)`);
const ignore_dir_regex = new RegExp(`(^${html_multi_file_dirs}$)|(^\\.)`);
const BagIt_Profile_Identifier =
"https://raw.githubusercontent.com/UTS-eResearch/datacrate/master/spec/0.3/profile-datacrate-v0.3.json";
const DataCrate_Specification_Identifier =
......
......@@ -21,8 +21,10 @@ var ejs = require("ejs");
context = require("../defaults/context.json");
const path = require("path");
const shell = require("shelljs");
const jsonld = require("jsonld");
const filesize = require("filesize");
const jsonld = require("../lib/jsonldhelper.js");
const helper = new jsonld();
const display_keys = [
"name",
"@type",
......@@ -173,13 +175,12 @@ module.exports = function () {
var zip_link;
var name = this.root_node["name"];
var json = JSON.stringify(node, null, 2);
if (this.first_page) {
zip_link = this.zip_path
? "<a href='" + this.zip_path + "'>Download a zip file</a>"
: "";
var catalog_actual_path = path.join(
path.dirname(out_path),
path.dirname(this.out_dir),
defaults.catalog_json_file_name
);
var stats = fs.statSync(catalog_actual_path);
......@@ -192,12 +193,14 @@ module.exports = function () {
up_link = `<a href="" class="active"><button type="button" class="btn btn-default btn-sm"><span class="glyphicon glyphicon-home"></span>&nbsp;${name}</button></a>`;
}
} else if (this.multiple_files_dir) {
var href = this.get_href(this.root_node["@id"]);
var href = this.get_href(this.root_node["@id"], node["@id"]);
up_link = `<a href=${href}><button type="button" class="btn btn-default btn-sm"><span class="glyphicon glyphicon-home"></span>&nbsp;${name}</button></a> `;
}
var time = new Date().toISOString();
out_path = path.join(this.out_dir, this.get_html_path(node["@id"]))
shell.mkdir("-p", path.dirname(out_path));
fs.writeFileSync(
out_path,
this.template({
......@@ -292,10 +295,10 @@ module.exports = function () {
this.item_by_id[part["@id"]]
) {
td_ele += ele("img", {
src: this.get_file_ref(this.item_by_id[part["@id"]]["path"])
src: this.get_file_ref(this.item_by_id[part["@id"]]["path"], item["@id"])
});
} else if (k === "path") {
td_ele += ele("a", { href: encodeURI(this.get_file_ref(part)) });
td_ele += ele("a", { href: encodeURI(this.get_file_ref(part, item["@id"])) });
td_ele += part
.replace(/\/$/, "")
.split("/")
......@@ -345,7 +348,7 @@ module.exports = function () {
var target_name = this.item_by_id[part["@id"]].name
? this.item_by_id[part["@id"]].name
: part["@id"];
var href = this.get_href(part["@id"]);
var href = this.get_href(part["@id"], item["@id"]);
td_ele += ele("a", { href: href });
td_ele += target_name;
td_ele += close("a");
......@@ -357,36 +360,55 @@ module.exports = function () {
get_up_path: function get_up_path(path) {
return "../".repeat(defaults.html_multi_file_dirs.length) + path;
},
get_file_ref: function get_file_ref(path) {
get_file_ref: function get_file_ref(dest_path, from_id) {
if (this.multiple_files_dir && !this.first_page) {
return this.get_up_path(path);
var source_path = this.get_html_path(from_id);
return path.relative(path.dirname(source_path), dest_path);
} else {
return path;
return dest_path;
}
},
get_href: function get_href(id) {
var path;
if (this.item_by_id[id]["path"]) {
path = this.item_by_id[id]["path"];
if (Array.isArray(path)) {
path = path[0];
}
}
if (this.multiple_files_dir) {
var link = "./";
if (path === "./" || path === "data/") {
link = this.get_up_path(defaults.html_file_name);
} else {
if (this.first_page) {
link += defaults.html_multi_file_dirs.join("/") + "/";
}
// TODO: fix this appalling hack and work out a better system of filenames!
link += id.replace(/\//g, "_").replace("?", "%3F") + ".html";
}
return link;
} else {
get_html_path: function get_html_path(id) {
if (this.item_by_id[id] && this.item_by_id[id]["path"]) {
var actual_path = helper.value_as_array(this.item_by_id[id]["path"])[0];
if (actual_path === "./" || actual_path === "data/") {
return defaults.html_file_name;
}
}
var p = "";
p += defaults.html_multi_file_dirs + "/";
p += "pairtree_root/";
id = id.replace(/\^/g, "^5e")
.replace(/ /g, "^20")
.replace(/"/g, "^22")
.replace(/</g, "^3c")
.replace(/\\/g, "^5c")
.replace(/\*/g, "^2a")
.replace(/\=/g, "^3d")
.replace(/\+/g, "^2b")
.replace(/>/g, "^3e")
.replace(/\|/g, "^7c")
.replace(/,/g, "^2c")
.replace(/\?/g, "^3f")
.replace(/\//g, "=")
.replace(/\:/g, "+")
.replace(/\./g,",")
p += id.replace(/(..)/g, '$1/').replace(/([^\/])$/, "$1/")
p += "index.html"
return p
},
get_href: function get_href(id, from_id) {
if (!this.multiple_files_dir) {
return "#" + id;
}
var dest_path = this.get_html_path(id);
var source_path = this.get_html_path(from_id);
return path.relative(path.dirname(source_path), dest_path);
},
format_cell: function (item, k) {
......@@ -510,10 +532,7 @@ module.exports = function () {
if (this.first_page) {
out_path = this.out_path;
} else {
out_path = path.join(
this.out_dir,
node["@id"].replace(/\//g, "_") + ".html"
);
out_path = this.get_html_path(node["@id"]);
cite = node["name"];
}
......@@ -573,7 +592,7 @@ module.exports = function () {
html += ele("iframe", {
width: "80%",
height: "90%",
src: this.get_file_ref(readme.path),
src: this.get_file_ref(readme.path, node["@id"]),
border: 1
});
html += close("iframe");
......@@ -602,21 +621,12 @@ module.exports = function () {
this.template = ejs.compile(temp);
}
this.out_path = out_path;
this.out_dir = out_path;
this.first_page = true;
this.multiple_files_dir = multiple_files;
if (multiple_files) {
this.multiple_files_dir = defaults.html_multi_file_dirs.join("/"); // Where to write out the
var out_dir = path.dirname(out_path);
for (let d of defaults.html_multi_file_dirs) {
out_dir += "/" + d;
shell.mkdir("-p", out_dir);
}
shell.rm(out_dir + "/*");
this.out_dir = out_dir;
} else {
this.multiple_files_dir = false;
shell.rm("-rf", path.join(out_path, defaults.html_multi_file_dirs, "*"))
}
this.first_page = true;
// TODO: Use loadjson or somesuch
// Shift loading into the calcyte script
if (!crate_data["@graph"]) {
......
......@@ -23,6 +23,9 @@ const context = require("../defaults/context.json");
const metadata_property_name = require("./property.js");
const uuidv4 = require("uuid/v4");
const ejs = require("ejs");
const jsonld = require("../lib/jsonldhelper.js");
const helper = new jsonld();
module.exports = function() {
this.metadata = {};
this.json_ld = {};
......@@ -113,6 +116,14 @@ module.exports = function() {
if (this.collection.bagged && frag.path) {
frag.path = path.join("data", frag.path);
}
var ident = helper.value_as_array(this.id)
for (let id of ident){
if (id && id.match(/(https?:\/\/)(dx\.)?(doi.org\/10\.)/i)) {
helper.push_value(frag, "identifier", id.replace(/https?:\/\//i, ""))
}
}
this.json_ld_fragment = frag;
//console.log(frag);
return frag;
......@@ -136,6 +147,9 @@ module.exports = function() {
//console.log("FILE ", this.collection.path, value);
this.id = path.join(this.collection.rel_path, value);
this.types.push("File");
var pr = new metadata_property_name();
pr.parse("path", this.id);
this.properties[pr.name] = pr;
this.is_file = value;
} else if (property.is_id) {
//console.log("Got an ID", value);
......
......@@ -75,6 +75,7 @@ module.exports = function () {
}
this.json_ld["@context"] = new_context
},
reference_to_item: function (node) {
// Check if node is a reference to something else
// If it is, return the something else
......@@ -87,6 +88,9 @@ module.exports = function () {
},
value_as_array: function (value) {
if (!value) {
return []
}
if (!Array.isArray(value)) {
return [value];
console.log("Making array", key, item[key])
......@@ -95,6 +99,20 @@ module.exports = function () {
}
},
push_value: function(item, key, value) {
if (item[key]) {
if (!Array.isArray(item[key])) {
item[key] = [item[key]]
}
item[key].push(value)
}
else {
item[key] = [value]
}
},
init: function init(json) {
this.json_ld = json;
this.item_by_id = {};
......
This diff is collapsed.
// Type definitions for calcyte 0.0.2
// Project: DataCrate
// Definitions by: Mike Lynch
/**
 * Placeholder declaration for Collection; members are stubs only.
 */
declare class Collection {
  constructor();
  property: any;
  // `method(any)` declared a parameter *named* "any" with an implicit any
  // type; spell out the name and the type so it compiles under noImplicitAny.
  method(value: any): void;
}
/* This is part of Calcyte a tool for implementing the DataCrate data packaging
spec. Copyright (C) 2018 University of Technology Sydney
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* Test for collection.js */
const Collection = require("../lib/collection.js");
const assert = require("assert");
const shell = require("shelljs");
var fs = require("fs");
var path = require("path");
const XLSX = require("xlsx");
const Datacite = require("../lib/datacite.js");
const fixtures = require("./fixtures");
describe("Create unique CATALOG.xlsx filenames", function() {
  // Checks get_unique_catalog_name(dir, existing) against a table of
  // directory / already-taken-names / expected-result cases.
  it("Should create nice names", function() {
    var c = new Collection();
    // The list of existing catalogs is passed positionally. The earlier
    // `(existing_catalogs = [...])` form was an assignment expression that
    // leaked an implicit global and would throw in strict mode.
    var cases = [
      {
        dir: "/home/me/my_dir",
        existing: [],
        expected: "CATALOG_my_dir.xlsx"
      },
      {
        dir: "/home/me/my_dir",
        existing: ["CATALOG_my_dir.xlsx"],
        expected: "CATALOG_my_dir_1.xlsx"
      },
      {
        dir: "/home/me/my_dir",
        existing: ["CATALOG_my_dir.xlsx", "CATALOG_my_dir_1.xlsx"],
        expected: "CATALOG_my_dir_2.xlsx"
      },
      {
        // Space in the directory name and a trailing slash.
        dir: "/home/me/my dir/",
        existing: [],
        expected: "CATALOG_my_dir.xlsx"
      }
    ];
    cases.forEach(function(tc) {
      assert.equal(c.get_unique_catalog_name(tc.dir, tc.existing), tc.expected);
    });
  });
});
describe("Create a CATALOG", function() {
  var test_path;
  before(function () { test_path = fixtures.buildup(); });
  after(function () { fixtures.teardown(); });
  // Reads the fixture directory, deletes one file, reads it again and checks
  // that the spreadsheet still lists the file (flagged as missing) while the
  // generated JSON-LD no longer contains it.
  it("Should create a catalog from scratch for test files", function() {
    this.timeout(15000);
    var c = new Collection();
    c.read(test_path, "./", false, 1000);
    var sheet_json = XLSX.utils.sheet_to_json(c.workbook.Sheets["Files"]);
    assert.equal(sheet_json[0]["FILE:Filename"], "1.pdf");

    fixtures.remove_test_file("1.pdf");
    // Start from a fresh collection for the second read.
    c = new Collection();
    c.read(test_path, "./", false, 1000);
    sheet_json = XLSX.utils.sheet_to_json(c.workbook.Sheets["Files"]);
    // 1.pdf deleted but still in spreadsheet
    assert.equal(sheet_json[0]["FILE:Filename"], "1.pdf");
    assert.equal(sheet_json[0]["*MISSING-FILE*"], "1");

    // Return the promise without a rejection handler: a handler that only
    // logs would turn a failing to_json_ld() into a passing test.
    return c.to_json_ld().then(function() {
      assert.equal(c.item_by_id["./"]["hasPart"].length, 11);
      assert(!c.item_by_id["./1.pdf"]);
      assert(c.item_by_id["1.sh"]);
    });
  });
});
describe("Create MANY CATALOGS", function() {
  var test_path;
  before(function () { test_path = fixtures.buildup(); });
  after(function () { fixtures.teardown(); });
  // Builds two nested directory trees whose folders share names, reads the
  // whole tree and checks the catalog file names that end up on disk.
  it("Should create appropriately named catalogs", function(done) {
    this.timeout(15000);

    // Join path segments underneath the fixture directory.
    function under_test_path(segments) {
      return path.join.apply(path, [test_path].concat(segments));
    }

    [["a", "a", "a", "b"], ["b", "a", "a", "b"]].forEach(function(segments) {
      shell.mkdir("-p", under_test_path(segments));
    });
    console.log(test_path);

    var collection = new Collection();
    collection.read(test_path, "./", false, 1000);
    console.log(collection.existing_catalogs);

    var expected_catalogs = [
      ["a", "CATALOG_a.xlsx"],
      ["a", "a", "CATALOG_a_1.xlsx"],
      ["a", "a", "a", "CATALOG_a_2.xlsx"],
      ["b", "a", "a", "CATALOG_a_4.xlsx"]
    ];
    expected_catalogs.forEach(function(segments) {
      assert(shell.test("-e", under_test_path(segments)));
    });
    done();
  });
});
describe("Create SOME CATALOGS", function() {
var test_path;
before(function () { test_path = fixtures.buildup(); });
after(function () { fixtures.teardown(); });
it("Should create appropriately named catalogs but only to a depth of two", function(done) {
this.timeout(15000);
shell.mkdir("-p", path.join(test_path, "a", "a", "a", "b"));
shell.mkdir("-p", path.join(test_path, "b", "a", "a", "b"));
var c = new Collection();
c.read(test_path, "./", false, 3);
//console.log(shell.test('-e', path.join(test_path,"a","CATALOG_a.xlsx")))
assert(shell.test("-e", path.join(test_path, "a", "CATALOG_a.xlsx")));
assert(
shell.test("-e", path.join(test_path, "a", "a", "CATALOG_a_1.xlsx"))
);
assert(
!shell.test("-e", path.join(test_path, "a", "a", "a", "CATALOG_a_2.xlsx"))
);
assert(