Commit 1f5b7497 authored by Mike Lynch

Rewriting the indexing config so that it's TYPE/FIELD/{config}

Faceting and resolving items are now unified under per-field config.
Rejigged filtering and the filter tests are passing. Added a test case
for empty filters, which let everything through.
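
The new shape, for example (a sketch; field names taken from the
fields.json changes below):

    {
      "types": {
        "Dataset": {
          "path":    { "filter": { "re": "^\\./|data/$" } },
          "creator": { "resolve": "multi", "facet": true },
          "keywords": { "facet": { "tokenize": "," } }
        }
      }
    }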
parent 4575dca1
@@ -19,19 +19,16 @@ class CatalogSolr {
setConfig(config) {
this.config = config;
// compile filters
// if( !this.validateConfig() ) {
// return null;
// }
this.filters = {};
_.each(this.config['types'], ( cf, type ) => {
if( 'filter' in cf ) {
this.filters[type] = this.compileFilter(cf['filter'])
} else {
this.filters[type] = () => true;
}
_.each(this.config['types'], ( typecf, type ) => {
const typef = {};
_.each(typecf, ( fieldcf, field ) => {
if( 'filter' in fieldcf ) {
typef[field] = fieldcf['filter'];
}
});
this.filters[type] = this.compileFilter(typef);
});
}
@@ -256,30 +253,50 @@ class CatalogSolr {
const cfTypes = this.config['types'];
const solrDocument = {};
// loop through each item in the JSON-LD @graph
for ( const item of crate.graph ) {
const type = item['@type'];
if( type in cfTypes ) {
// get config for this type of item
const cf = cfTypes[item['@type']];
// filters are compiled from config when it's loaded
if( this.filters[type](item) ) {
// test if item passes the filters (these were compiled when the
// config was loaded)
if( this.filters[type](item) ) {
// start with the fields which have configured mappings for all items
const solr = this.baseSolr(cfMap, item);
const skip = cf['skip'] || [];
// then map the rest, unless they're in 'skip'
_.each(item, ( value, field ) => {
if( !(field in cfMap) ) {
if( cf.flatten && cf.flatten[field] ) {
solr[field] = this.flatten(cf.flatten[field], value);
if( !(field in cfMap) && !skip.includes(field) ) {
// 'resolve' contains fields where we need to look up
// item(s) in the graph to get information we want in the
// index (like a person or an organisation's name)
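// e.g. a creator value [ { "@id": "#person1" } ] is looked up in the
// crate and the resolved Person item is flattened into the index
// (ids here are hypothetical)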
if( cf.resolve && cf.resolve[field] ) {
solr[field] = this.resolveValues(crate, cf.resolve[field], value);
} else {
solr[field] = value;
solr[field] = this.unwrap(value);
}
}
});
// TODO - add facets
// A separate loop to look up and add facets, because we may
// need to do different stuff with them even if we've already looked
// them up in the main loop (id lookups in ROCrate are cheap)
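// e.g. facet 'keywords' with field_suffix '_Dataset_facetmulti' is
// written to solr['keywords_Dataset_facetmulti']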
_.each(cf['facets'], ( facetcf, facet ) => {
const facetField = facet + facetcf['field_suffix'];
console.log(`resolving facets: ${facet}`);
solr[facetField] = this.resolveFacets(crate, facetcf, item[facet]);
});
if( !(type in solrDocument) ) {
solrDocument[type] = [];
}
console.log(JSON.stringify(solrDocument))
solrDocument[type].push(solr)
}
}
@@ -290,6 +307,64 @@ class CatalogSolr {
}
resolveValues(crate, cf, value) {
if( cf === 'multi' ) {
return value.map((v) => this.resolveAndFlatten(crate, v));
} else {
return this.resolveAndFlatten(crate, value);
}
}
resolveAndFlatten(crate, value) {
if( !('@id' in value ) ) {
return "no @id in value";
}
const resolved = crate.getItem(value['@id']);
if( !resolved ) {
return `@id ${value['@id']} not found`;
}
return JSON.stringify(resolved).replace(/"/g, '\'');
}
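// e.g. a resolved item {"@id":"#p1","name":"Jo"} comes back as the
// literal "{'@id':'#p1','name':'Jo'}", so Solr stores a string, not
// JSON (values here are hypothetical)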
// this could be merged with resolveValues, but perhaps
// is better kept separate. I think that the facets need
// to be made available in flattened form, but I don't
// understand the logic of the old code, so for now it's
// this
resolveFacets(crate, cf, value) {
if( cf['tokenize'] ) {
if( value ) {
return value.split(cf['tokenize']['delim']);
} else {
return [];
}
}
if (cf['fieldName']) {
if( Array.isArray(value) ) {
return value.map((v) => {
const resolved = crate.getItem(v['@id']);
if( resolved ) {
return resolved[cf['fieldName']]
} else {
return null;
}
});
} else {
return [];
}
}
return value;
}
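// e.g. with tokenize.delim ',' the value "a,b" becomes [ "a", "b" ];
// with fieldName 'name', each linked item resolves to its name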
// mappings which are done for all solr records
@@ -298,50 +373,24 @@ class CatalogSolr {
const base = {};
_.each(map_all, ( targets, field ) => {
_.each(targets, ( target ) => {
base[target] = item[field]
base[target] = this.unwrap(item[field])
});
});
return base;
}
// flatten: take a value which is a JSON object and escape it and stringify it
// so it can get saved in Solr as a literal, not JSON.
// this is used when we want the JSON structure to get pushed through to the
// UI at the Portal for things like Creator.
// unwrap a value if it's in an array
// if the value in config is 'array', then the input is interpreted as an
// array of values, each of which is flattened individually and returned as
// an array - this is what happens for eg with creators
//
// [ { "@id": "creator1" }, { "@id": "creator2" } ]
//
// becomes
//
// [ "{ \'@id\': \'creator1\'}", "{ \'@id\': \'creator2\'}" ]
//
// if not, the value is flattened as a whole, ie:
//
// "[ { \'@id\': \'creator1\'}, { \'@id\': \'creator2\'} ]
flatten_value(cf, value) {
if( cf === 'array' ) {
return value.map(flatten);
unwrap(value) {
if( Array.isArray(value) ) {
return value[0];
} else {
return flatten(value);
return value;
}
}
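// e.g. unwrap([ "./", "data/" ]) returns "./" (just the first value);
// unwrap("./") returns "./" unchanged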
flatten(value) {
return JSON.stringify(value).replace(/"/g, '\'')
}
createSolrObject(jsonld, rootConfig) {
//Peter's idea is to convert everything into an array then it is safer to work to convert
......
@@ -18,14 +18,14 @@
"axios": "^0.18.1",
"chai": "^4.2.0",
"datacrate": "git+https://code.research.uts.edu.au/eresearch/datacrate.git#random_as_library",
"fs-extra": "^8.0.1",
"fs-extra": "^8.1.0",
"lodash": "^4.17.15",
"ocfl": "^1.0.3",
"random-word": "^2.0.0",
"ro-crate": "^1.0.1",
"yargs": "^13.2.4"
"ro-crate": "^1.1.0",
"yargs": "^13.3.0"
},
"devDependencies": {
"mocha": "^6.1.4"
"mocha": "^6.2.0"
}
}
@@ -5,63 +5,40 @@
},
"types": {
"Dataset": {
"filter": { "path": { "re": "^\\./|data/$"} },
"recordTypeName": "dataset",
"flatten": {
"creator": "array",
"GeoCoordinates": {}
"path": {
"filter": { "re": "^\\./|data/$" }
},
"hasPart": "skip",
"creator": {
"resolve": "multi",
"facet": true
},
"facets": {
"datePublished": {
"field_suffix": "_Dataset_facet",
"trim": true
},
"keywords": {
"trim": true,
"field_suffix": "_Dataset_facetmulti",
"tokenize": {
"delim": ","
}
},
"creator": {
"trim": true,
"fieldName": "name",
"field_suffix": "_Dataset_facetmulti"
},
"publisher": {
"trim": true,
"fieldName": "@id",
"field_suffix": "_Dataset_facet"
},
"type": {
"trim": true,
"field_suffix": "facetmulti",
"skip_entry_type_suffix": true,
"escape_value": "solr_field"
"keywords": {
"facet": {
"tokenize": ","
}
},
"oai-pmh": {
"core": "dataset",
"set": "datasource:ReDBox"
"datePublished": {
"facet": true
},
"publisher": {
"resolve": "single",
"facet": {
"field": "@id"
}
}
},
"Person": {
"reverse": "in the reverse jsonld pointer",
"core": "dataset",
"format": "jsonld",
"recordTypeName": "person",
"flatten": {
"affiliation": "array"
"affiliation": {
"resolve": "multi"
},
"facets": {
"type": {
"trim": true,
"field_suffix": "facetmulti",
"skip_entry_type_suffix": true,
"escape_value": "solr_field"
}
}
}
}
}
\ No newline at end of file
@@ -24,14 +24,12 @@ function randomGraph(n, type, fields, value_callback) {
});
}
function makeCatalog(spec) {
function makeCatalog(fieldcf) {
const catalog = new CatalogSolr();
catalog.setConfig({
types: {
Dataset: {
filter: spec
}
Dataset: fieldcf
}
});
@@ -50,12 +48,23 @@ function randomSubstring(word) {
describe('single filters', function () {
this.timeout(TIMEOUT);
it('matches everything when filter is empty', function () {
const graph = randomGraph(GRAPH, 'Dataset', ['path']);
const catalog = makeCatalog({path: {} });
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.have.lengthOf(GRAPH);
});
it('can pick items by exact matching a single field', function () {
_.times(REPEATS, () => {
const graph = randomGraph(GRAPH, 'Dataset', ['path']);
const item = _.sample(graph);
const lookfor = item['path'][0];
const catalog = makeCatalog({path: lookfor});
const catalog = makeCatalog({path: {filter: lookfor} });
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.not.be.empty;
_.each(matches, (match) => {
@@ -70,7 +79,7 @@ describe('single filters', function () {
const graph = randomGraph(GRAPH, 'Dataset', ['path']);
const item = _.sample(graph);
const lookfor = randomSubstring(item['path'][0]);
const catalog = makeCatalog({ path: { re: lookfor } } );
const catalog = makeCatalog({ path: { filter: { re: lookfor } } });
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.not.be.empty;
const lookfor_re = new RegExp(lookfor);
@@ -85,7 +94,7 @@ describe('single filters', function () {
_.times(REPEATS, () => {
const graph = randomGraph(GRAPH, 'Dataset', ['path'], () => [ _.sample(['./', 'data/'])] );
const lookfor = "^\\./|data/$";
const catalog = makeCatalog({path: { re: lookfor } } );
const catalog = makeCatalog({path: { filter: { re: lookfor } } } );
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.be.an('array').and.to.have.lengthOf(GRAPH);
const lookfor_re = new RegExp(lookfor);
@@ -103,7 +112,7 @@ describe('single filters', function () {
const lookfor = item['path'][0];
// add more items without a path
const graph2 = graph.concat(randomGraph(GRAPH, 'Dataset', ['name']));
const catalog = makeCatalog({path: { re: lookfor } } );
const catalog = makeCatalog({path: {filter: { re: lookfor } }});
const matches = graph2.filter(catalog.filters['Dataset']);
const lookfor_re = new RegExp(lookfor);
_.each(matches, (match) => {
@@ -117,7 +126,7 @@ describe('single filters', function () {
it('can filter values whether or not they are in arrays', function () {
const values = [ [ 'one' ], [ 'two' ], 'one', 'two' ];
const graph = values.map((v) => { return { '@type': 'Dataset', 'path': v }});
const catalog = makeCatalog({path: 'one' });
const catalog = makeCatalog({path: {filter: 'one' }});
const matches = graph.filter(catalog.filters['Dataset']);
expect(matches).to.have.lengthOf(2);
_.each(matches, (match) => {
@@ -143,15 +152,15 @@ describe('single filters', function () {
function randomFilter(fields, item) {
const n = _.random(2, fields.length);
const ffields =_.sampleSize(fields, n);
const filter = {};
const filters = {};
_.each(ffields, (ff) => {
if( _.random(1) === 0 ) {
filter[ff] = item[ff][0]
filters[ff] = { filter: item[ff][0] }
} else {
filter[ff] = { re: randomSubstring(item[ff][0]) }
filters[ff] = { filter: { re: randomSubstring(item[ff][0]) } }
}
});
return filter;
return filters;
}
describe('multiple filters', function () {
......
const assert = require('assert');
const expect = require('chai').expect;
const _ = require('lodash');
const path = require('path');
const fs = require('fs-extra');
@@ -18,6 +19,8 @@ function jsonRecord(basePath, fileName) {
}
}
// TODO: have this actually test a dataset and some people
describe('create solr object', function () {
const test_data = path.join(process.cwd(), 'test-data');
const fieldsPath = path.join(test_data, 'fields.json');
@@ -33,6 +36,6 @@ describe('create solr object', function () {
fs.writeFileSync(path.join(test_data, "solr_output.json"), JSON.stringify(solrObject, null, 2));
assert.strictEqual(solrObject['Dataset'][0]['record_format_s'], 'Dataset','Dataset not loaded');
expect(solrObject['Dataset'][0]['record_type_s']).to.equal('Dataset');
});
});
\ No newline at end of file