Commit 84f41b0f authored by PTSEFTON's avatar PTSEFTON
Browse files

Initial commit - lots of work still to do

parents
#!/usr/bin/env node
const Collection = require("./collection.js");
const args = require('minimist')(process.argv.slice(2));
const shell = require('shelljs')
// Positional command-line arguments: the directories to catalogue.
var dirs = args._;
console.log(dirs);
console.log(args.b);

// For every argument that is an existing directory: read it into a
// Collection, write its JSON-LD catalogue, then render the HTML index.
dirs.forEach(function(dir) {
  if (!shell.test('-d', dir)) {
    return;
  }
  var c = new Collection();
  c.read(dir);
  c.to_json_ld()
    .then(function() {
      c.to_html();
    })
    // A trailing catch (instead of a second argument to then) also reports
    // errors thrown by to_html(), which previously escaped as unhandled
    // promise rejections.
    .catch(function(err) {
      console.log(err);
    });
});
var jsonld = require('jsonld');
var fs = require('fs');
var program = require('commander');
var XLSX = require('xlsx');
var path = require('path');
const ignore= /^\./;
const context = require("./defaults/context.json");
const Property = require("./property.js");
const Item = require("./item.js");
const uuidv4 = require('uuid/v4');
const shell = require("shelljs");
var fs = require('fs');
//const catalog_template = require("defaults/catalog_template.html");
const builder = require('xmlbuilder');
const Index = require('./index_html.js');
const Datacite = require('./datacite.js')
module.exports = function(){
this.collection_metadata = new Item();
this.children = [];
this.rel_path = "./";
this.items = [];
this.name_lookup = {};
this.id_lookup = {};
this.json_ld = {};
this.field_names_by_type = {};
this.existing_catalogs = [];
function get_collection_metadata(workbook, collection) {
// TODO - make the collection just another kind of item object
raw_collection_metadata = XLSX.utils.sheet_to_json(workbook.Sheets['Collection']);
var item_json = {}
for (var i=0; i < raw_collection_metadata.length ; i++) {
var name_value = raw_collection_metadata[i];
item_json[name_value['Name']] = name_value['Value'];
}
item_json["TYPE:"] = "Dataset";
item_json["path"] = collection.rel_path;
if (!(collection.rel_path === "./")) {
//console.log("Setting dataset based on path");
item_json["ID"] = collection.rel_path;
//item_json["TYPE:"] = "Dataset";
} else if (!item_json["ID"]) {
item_json["ID"] = collection.rel_path;
}
collection.collection_metadata.load_json(item_json, collection);
}
function get_metadata(workbook, collection, sheet_name) {
metadata = XLSX.utils.sheet_to_json(workbook.Sheets[sheet_name]);
for (var i=0; i < metadata.length ; i++) {
item_json = metadata[i];
//console.log("JSON", item_json)
item = new Item();
item.load_json(item_json, collection);
collection.items.push(item);
}
}
function flattenit(json, collection) {
var promises = jsonld.promises;
json["@context"] = context;
var promise = promises.flatten(json, context); //frame(json, frame);
return(promise);
}
return {
collection_metadata: this.collection_metadata,
children: this.children,
rel_path: this.rel_path,
dir: this.dir,
id_lookup: this.id_lookup,
name_lookup: this.name_lookup,
items : this.items,
json_ld : this.json_ld,
existing_catalogs : this.existing_catalogs,
get_unique_catalog_name: function get_unique_catalog_name(dir, existing_catalogs= []){
var index = 0;
dir = path.basename(dir).replace(" ","_");
var potential_catalog_filename =`CATALOG_${dir}.xlsx`;
while (existing_catalogs.includes(potential_catalog_filename)){
index += 1;
potential_catalog_filename = `CATALOG_${dir}_${index}.xlsx`;
//console.log(index, potential_catalog_filename);
}
//console.log(index, potential_catalog_filename)
return potential_catalog_filename;
},
to_html : function to_html() {
var index_maker = new Index();
index_maker.make_index_html(this.json_ld);
citer = new Datacite();
text_citation = citer.make_citation("./" + path.join(this.dir, "CATALOG.json"), path.join(this.dir, "index.html"));
index_maker.write_html("defaults/catalog_template.html", path.join(this.dir, "index.html"), text_citation);
},
to_json : function to_json(graph) {
if (!this.collection_metadata) {
this.collection_metadata = new Item();
}
var collection_json = this.collection_metadata.to_json_ld_fragment();
graph.push(collection_json);
//console.log("COLLECTION METADATA", json);
for (var [key, item] of Object.entries(this.items)) {
item_json = item.to_json_ld_fragment();
// Keep track of whether to add this to the graph
var exists = true;
//console.log("THINGS", item.id, item.5, item.name)
if (item.is_file) {
if (shell.test("-e", path.join(this.root_dir, item.id))) {
if (!collection_json['hasPart']) {
collection_json['hasPart'] = [];
}
collection_json['hasPart'].push(
{
"@id": item.id
})
}
else {
exists = false;
}
}
if (exists) {
graph.push(item_json);
}
}
//Sub collections
this.children.forEach(function(child) {
child.to_json(graph);
if (!collection_json['hasPart']) {
collection_json['hasPart'] = [];
}
//console.log("Pushing part", child.collection_metadata.id)
collection_json['hasPart'].push(
{
"@id": child.collection_metadata.id
//"@type": "@id"
})
});
},
to_json_ld : function to_json_ld() {
// Turn the entire collection into a JSON-LD document
json = {
'@graph' : [],
'@context' : context
};
this.to_json(json["@graph"]);
//console.log("JSON", JSON.stringify(json, null, 2));
json = JSON.parse(JSON.stringify(json));
//console.log(JSON.stringify(json, null, 2));
var collection = this;
promise = flattenit(json, this);
return promise.then(
function(flattenated) {
collection.json_ld = flattenated;
collection.json_by_id = {};
collection.json_by_url = {};
for (let iid = 0; iid < flattenated["@graph"].length; iid++) {
var item = flattenated["@graph"][iid];
collection.json_by_id[item["@id"]] = item;
if (item.path) {
collection.json_by_url[item.path] = item;
}
}
console.log("Writing in", collection.dir);
fs.writeFileSync(path.join(collection.dir, "CATALOG.json"), JSON.stringify(flattenated, null, 2),
function(err) {
if(err) {
return console.log(err, "Error wirting in", collection.dir);
}
log("The file was saved!" + path.join(collection.dir, "CATALOG.json"));
});
},
function(err) {console.log(err)});
},
read : function read(dir, rel_path = "./", parent) {
//console.log("existing", parent.existing_catalogs)
if (parent){
this.parent = parent
this.name_lookup = parent.name_lookup;
this.id_lookup = parent.id_lookup;
this.existing_catalogs = parent.existing_catalogs;
this.root_dir = parent.root_dir;
//console.log(this.existing_catalogs);
}
else {
this.name_lookup = {};
this.id_lookup = {};
this.existing_catalogs = [];
this.root_dir = dir;
}
this.children = [];
this.dir = dir;
this.rel_path = rel_path;
this.file_info = null;
//console.log("Lookup tables", this.name_lookup, this.id_lookup);
//console.log("dir", dir);
//console.log(('sf -nr -json "' + dir + '"'));
//console.log("file", JSON.stringify(this.file_info_by_filename, null, 2));
var items = fs.readdirSync(dir);
//console.log("These are the items", dir, items);
if (items) {
//console.log("ITEMS NOW", items);
//TODO - make this a testable function
var catalogs = items.filter(item => /^CATALOG.*xlsx$/.test(item))
this.existing_catalogs = this.existing_catalogs.concat(catalogs);
if (catalogs.length > 1) {
console.log("More than one catalog, using this one: ", catalogs[0])
}
items = items.filter(item => !(/^CATALOG.*(xlsx|html|json)$/.test(item)));
items = items.filter(item => !(/(^index.html$)|(^~)|(^\.)|(datacite.xml)/.test(item)));
items = items.filter(item => shell.test('-f',path.join(dir, item)));
//console.log("CATALOGS", catalogs)
//TODO - make this configurable
if (catalogs.length === 0){
//console.log("Making new catalog");
var catalog_file = !parent ? "CATALOG.xlsx" : "CATALOG_subdir.xlsx";
var new_catalog_file = this.get_unique_catalog_name(dir, this.existing_catalogs);
this.existing_catalogs.push(new_catalog_file);
//console.log("EXISTING AT THIS POINT", this.existing_catalogs);
catalogs = [new_catalog_file]
fs.writeFileSync(path.join(dir, new_catalog_file),
fs.readFileSync(path.join(__dirname,"defaults", catalog_file)));
//console.log("New Catalog", new_catalog_file);
//COPY IN A NEW CATALOG
//IF ROOT - use default
//ELSE sub catalog
}
if (catalogs.length > 0){
if (items.length < 100) {
this.file_info = JSON.parse(shell.exec('sf -nr -json "' + dir + '"').stdout);
//console.log("FILES", JSON.stringify(this.file_info.files, null, 2));
this.file_info_by_filename = {}
for(var i = 0; i < this.file_info.files.length; i++) {
var f = this.file_info.files[i];
this.file_info_by_filename[f.filename.replace(/.*\//, "")] = f;
}
}
//console.log(dir, catalogs[0]);
catalog_path = path.join(dir, catalogs[0]);
this.workbook = XLSX.readFile(catalog_path); //First one found only
sheet_names = this.workbook.SheetNames;
for (var i=0; i < sheet_names.length; i++ ) {
sheet_name = sheet_names[i];
var sheet = this.workbook.Sheets[sheet_name];
//console.log(sheet);
if (sheet_name == "Collection") {
get_collection_metadata(this.workbook, this);
} else if (sheet_name == "Files" && this.file_info ) {
var header_array = XLSX.utils.sheet_to_csv(sheet, options={"header": false});
//console.log("HEADER ARRAY", header_array.split("\n")[0].split(","));
var header = header_array.split("\n")[0].split(",");
//console.log(header);
sheet_json = XLSX.utils.sheet_to_json(sheet);
//sheet_json = XLSX.utils.sheet_to_json(this.workbook.Sheets['Files']);
//console.log("SHEET JSON ORIGINAL", sheet_json);
sheet_json.forEach(function(row) {
var f = row["FILE:Filename"];
if(f) {
if (items.includes(f)) {
items = items.filter(function(e) { return e !== f })
} else {
row["*MISSING-FILE*"] = "1"
}
}
});
// items now only contains new files so add them
items.forEach(function(f){
sheet_json.push({"FILE:Filename": f});
})
// Iterate over items and add files
//console.log("SHEET_JSON UPDATED", sheet_json);
//console.log(sheet);
this.workbook.Sheets['Files'] = XLSX.utils.json_to_sheet(sheet_json, options ={"header": header});
XLSX.writeFile(this.workbook, catalog_path);
//console.log(XLSX.utils.sheet_to_json(this.workbook.Sheets['Files']));
get_metadata(this.workbook, this, "Files");
// Write back
}
else {
get_metadata(this.workbook, this, sheet_name);
}
//console.log("COLLECTION METADATA:", this.collection_metadata);
}
}
var subdirs = fs.readdirSync(dir).filter(item => (fs.lstatSync(path.join(dir, item)).isDirectory() && !(item.match(ignore)) ));
//console.log("Subdirs", subdirs)
if (subdirs.length > 0) {
for (var i=0; i < subdirs.length; i++) {
//console.log("Looking at subdirs", subdirs[i]);
child = new module.exports();
if (child.read(path.join(dir, subdirs[i]), path.join(this.rel_path, subdirs[i]), this));
this.existing_catalogs = this.existing_catalogs.concat(child.existing_catalogs);
this.children.push(child);
}
//console.log("NAMES HERE", this.name_lookup);
}
}
}
}
}
const builder = require('xmlbuilder');
var fs = require('fs');
module.exports = function(){
return(
{
make_citation : function make_citation(json_path, out_path) {
/* Return a datacite citation in XML format """
# Check we have the metadata we need as per DataCrate spec
To generate DataCite.xml a DataCrate MUST have the following properties
on the DataCrate Microdocument for the schema:Dataset level:
* A [schema:identifier] for the DataCrate which is a DOI URL.
* At least one [schema:creator] with a [schema:givenName] and [schema:familyName].
* At least one [schema:name] (which maps to a DataCite title).
* At least one [schema:publisher] property which SHOULD be an organization but may be a String
value.
* A [schema:datePublished] property in [ISO 8601 Date Format].
"""
*/
//console.log(Object.keys(this.json_by_url));
var crate_data = require(json_path);
this.json_by_id = {};
this.json_by_url = {};
this.json_by_type = {};
graph = crate_data["@graph"];
for (let i = 0; i < graph.length ; i++) {
var item = graph[i];
if (item['@id']){
this.json_by_id[item['@id']] = item;
}
if (item['path']){
this.json_by_url[item['path']] = item;
}
if (item['@type']) {
if (!this.json_by_type[item['@type']]) {
this.json_by_type[item['@type']] = [];
}
this.json_by_type[item['@type']].push(item);
}
}
//console.log("THIS JSON BY URL", this.json_by_url)
var root = this.json_by_url["./"];
var can_cite = true;
var report = "";
const ns = "http://datacite.org/schema/kernel-4"
var xml = builder.create('resource', { encoding: 'utf-8' })
xml.att('xmlns', ns);
xml.att('schemaLocation', 'http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd')
//console.log(xml.end({ pretty: true }));
// Look for creators
var creators_strings = [];
if (root["creator"]){
var creators = root["creator"];
if (!Array.isArray(creators)){
creators = [creators]
}
var creators_el = xml.ele('creators');
//console.log(xml.end({ pretty: true }));
for (var i = 0; i < creators.length; i++) {
var found_creator = false;
var creator_names = "";
var creator = this.json_by_id[creators[i]["@id"]];
var creator_el;
//console.log("Looking at creator", creators[i])
if (creator["familyName"] && creator["givenName"]) {
//console.log("Got names");
creator_el = creators_el.ele("creator");
creator_names = creator["familyName"] + ", " + creator["givenName"];
creator_el.ele("creatorName", creator_names) ;
creator_el.ele('givenName', creator["givenName"]);
creator_el.ele('familyName', creator["familyName"]);
found_creator = true;
creators_strings.push(creator_names);
} else if (creator["name"]) {
//console.log("Got a name");
creator_el = creators_el.ele("creator").ele("creatorName", creator["name"]);
creators_strings.push(creator["name"]);
creator_names = creator["name"];
found_creator = true;
}
if (found_creator && creator["@id"] && creator["@id"].match("https?://orcid.org/")) {
var name_id_el = creator_el.ele("nameIdentifier", creator["@id"]);
name_id_el.att("schemeURI","http://orcid.org");
name_id_el.att("nameIdentifierScheme", "ORCID");
}
}
}
if (creators_strings.length === 0) {
can_cite = false;
report += "Data citations requires * At least one [schema:creator] with a [schema:givenName] and [schema:familyName]."
}
if (root["@id"]){
var identifier = root["@id"];
if (identifier.startsWith('http://dx.doi.org/10.')) {
//<identifier identifierType="DOI">10.5072/example-full</identifier>
id_el = xml.ele("identifier", identifier.replace("http://dx.doi.org/",""))
id_el. att("identifierType", "DOI");
}
else {
can_cite = false;
}
}
else
{
report += "There is no Identifier";
can_cite = false;
}
var name = "";
if (root["name"]){
/*
<titles>
<title xml:lang="en-us">Full DataCite XML Example</title>
<title xml:lang="en-us" titleType="Subtitle">
Demonstration of DataCite Properties.</title>
</titles>
*/
var titles_el = xml.ele("titles");
titles_el.ele("title", root["name"]);
name = root["name"];
}
else {
can_cite = false;
report += "Data Citation requires at least one [schema:name] (Title) (which maps to a DataCite title).";
}
if (root["publisher"]) {
/*
<publisher>DataCite</publisher>
*/
var publisher = root["publisher"]
if (publisher["@id"] && this.json_by_id[publisher["@id"]] && this.json_by_id[publisher["@id"]].name){
publisher = this.json_by_id[publisher["@id"]].name
}
xml.ele("publisher", publisher);
}
else {
can_cite = false;
report += "At least one [schema:publisher] property which SHOULD be an organization but may be a String";
}
if (root["datePublished"] && root["datePublished"].match(/^\d\d\d\d/)) {
//<publicationYear>2014</publicationYear>
published = root["datePublished"].slice(0,4);
var date_published_el = xml.ele("publicationYear", published);
}
else {
can_cite = false;
report += "A [schema:datePublished] property in [ISO 8601 Date Format]";
}
xml.ele("resourceType","DataCrate v0.1").att("resourceTypeGeneral", "Dataset");
this.citation = "";
if (can_cite){
//Datacite text citation:
//Creator (PublicationYear): Title. Version. Publisher. ResourceType. Identifier
//console.log(creators_strings);
this.citation += creators_strings.join(", ");
this.citation += ` (${published}) `;
this.citation += `${name}. `;
this.citation += `${publisher}. `;
this.citation += "Datacrate. ";
this.citation += identifier;
//console.log("REPORT" + report);
//console.log("CITATION", this.citation);
//console.log(xml.end({ pretty: true }));
} else {
console.log("CAN'T CITE", report);
this.citation = report;
}
return this.citation;
},
}
)}
#!/usr/bin/env node
console.log('Hello, world!');
var fs = require('fs');
var program = require('commander');
var XLSX = require('xlsx');
var path = require('path');
const context = require("./defaults/context.json");
const Collection = require("./collection.js")

// Command line: <dir> [-r]. Reads the directory into a Collection and prints
// the collection-level metadata.
program
  .arguments('<dir>')
  .option('-r, --recursive', 'Process sub-directories')
  .action(function(dir){
    var collection = new Collection();
    // Collection.read(dir, rel_path = "./", parent): the root is read with
    // the defaults. Passing `program` as rel_path and `true` as parent (as
    // this call used to) does not match that signature, and the former
    // `is_root = true` argument created an implicit global.
    // NOTE(review): the --recursive flag is parsed but not consulted here -
    // read() itself descends into sub-directories.
    collection.read(dir);
    console.log("METADATA!!!!!!!!!!!!!!!", collection.collection_metadata)
  })
  .parse(process.argv);
const builder = require('xmlbuilder');
var fs = require('fs');
var ejs = require('ejs');
module.exports = function(){
return(
{
write_html : function write_html(template_path, out_path, citation_text) {
//
var temp = fs.readFileSync(template_path, { encoding: 'utf8' });
var template = ejs.compile(temp);
this.html = template({html: this.html_el.end({ pretty: true }), citation: citation_text});
if (out_path) {
fs.writeFileSync(out_path, this.html);
}
},
items_to_html : function items_to_html(node, html, toc) {
var table_el = html.ele("table");
// Find all the keys
keys = []
for (let [_, f] of Object.entries(node)) {
keys = keys.concat(Object.keys(f));
}
key_set = new Set(keys);
key_set.delete('@type');
key_set.delete('identifier');
key_set.delete('fileFormat');
var header_row_el = table_el.ele("tr");
for (let k of key_set) {
header_row_el.ele("th", k[0].toUpperCase() + k.substring(1));
}