Commit 0045094d authored by PTSEFTON's avatar PTSEFTON
Browse files

Added GTM test data in as files rather than a referenced repo

parent b286ff5f
GTM @ 132249b4
Subproject commit 132249b4d9783ffceca13af23e189717d317b236
<html>
<head>
<style>
table {
width: 90%;
text-align: left;
vertical-align: top;
margin-bottom: 2em;
}
td {
margin-right: 2em;
white-space: normal;
}
details {
margin-left: 2em;
}
.break {
word-break: break-all;
}
div.collection {
margin-left: +1em;
}
</style>
<script type="application/ld+json">
{
"@id": "https://doi.org/10.4225/59/59e3d6d08faa9",
"@type": [
"Dataset"
],
"contact": [
{
"@id": "http://orcid.org/0000-0002-8367-6908"
}
],
"citation": [
{
"@id": "http://dx.doi.org/10.1109/TCYB.2014.2386282"
}
],
"path": [
"./"
],
"creator": [
{
"@id": "http://orcid.org/0000-0002-8367-6908"
},
{
"@id": "http://orcid.org/0000-0003-0690-4732"
},
{
"@id": "http://orcid.org/0000-0003-3960-0583"
},
{
"@id": "https://orcid.org/0000-0002-6953-3986"
}
],
"datePublished": [
"2018-03-10"
],
"description": [
"This demo is the sampling inference for Graph Topic Model, and more details about this model can be found in the following reference: @ARTICLE{7015568, author={J. Xuan and J. Lu and G. Zhang and X. Luo}, journal={IEEE Transactions on Cybernetics}, title={Topic Model for Graph Mining}, year={2015}, volume={45}, A Markov chain Monte Carlo (MCMC) algorithm is developed and implemented to inference the Graph Topic Model (GTM). GTM is a probabilistic graphical model for the data represented by graph structure, e.g., chemical formulas or documents."
],
"hasPart": [
{
"@id": "gtm_gs.m"
},
{
"@id": "LICENSE"
},
{
"@id": "README.txt"
},
{
"@id": "test_gtm.m"
}
],
"identifier": [
"https://doi.org/10.4225/59/59e3d6d08faa9"
],
"license": [
{
"@id": "https://www.gnu.org/licenses/gpl-3.0.en.html"
}
],
"name": [
"GTM"
],
"publisher": [
"University of Technology Sydney"
]
}
</script>
<link rel="stylesheet"
href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css"
integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u"
crossorigin="anonymous"/>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<meta charset='utf-8'/>
</head>
<body>
<nav class="navbar navbar-inverse">
<ul class="nav navbar-nav" >
<li >
<a href="" class="active"><button type="button" class="btn btn-default btn-sm"><span class="glyphicon glyphicon-home"></span>&nbsp;GTM</button></a></li>
</ul>
</nav>
<div class="container">
<div class="jumbotron">
<h3>GTM</h3>
<h4></h4>
</div>
<p>A machine-readable version of this page, created at 2018-09-06T23:50:26.371Z, is available as <a href='CATALOG.json'>CATALOG.json</a>.</p>
<hr
><table class = 'table' id = 'https://doi.org/10.4225/59/59e3d6d08faa9'
><tr
><th style = 'white-space: nowrap; width: 1%;'
>@id</th
><td
><a href = 'https://doi.org/10.4225/59/59e3d6d08faa9' class = 'fa fa-external-link' title = 'GTM'
>https://doi.org/10.4225/59/59e3d6d08faa9</a
></td
></tr
><tr
><th
><span
>name<sup
><a href = 'http://schema.org/name' title = 'Definition of: name'
>?</a
></sup
></span
></th
><td
><b
>GTM</b
></td
></tr
><tr
><th
>@type</th
><td
><span
>Dataset<sup
><a href = 'http://schema.org/Dataset' title = 'Definition of: Dataset'
>?</a
></sup
></span
></td
></tr
><tr
><th
><span
>description<sup
><a href = 'http://schema.org/description' title = 'Definition of: description'
>?</a
></sup
></span
></th
><td
>This demo is the sampling inference for Graph Topic Model, and more details about this model can be found in the following reference: @ARTICLE{7015568, author={J. Xuan and J. Lu and G. Zhang and X. Luo}, journal={IEEE Transactions on Cybernetics}, title={Topic Model for Graph Mining}, year={2015}, volume={45}, A Markov chain Monte Carlo (MCMC) algorithm is developed and implemented to inference the Graph Topic Model (GTM). GTM is a probabilistic graphical model for the data represented by graph structure, e.g., chemical formulas or documents.</td
></tr
><tr
><th
><span
>datePublished<sup
><a href = 'http://schema.org/datePublished' title = 'Definition of: datePublished'
>?</a
></sup
></span
></th
><td
>2018-03-10</td
></tr
><tr
><th
><span
>creator<sup
><a href = 'http://schema.org/creator' title = 'Definition of: creator'
>?</a
></sup
></span
></th
><td
><ul
><li
><a href = './CATALOG/http:__orcid.org_0000-0002-8367-6908.html'
>J. Xuan</a
></li
><li
><a href = './CATALOG/http:__orcid.org_0000-0003-0690-4732.html'
>J. Lu</a
></li
><li
><a href = './CATALOG/http:__orcid.org_0000-0003-3960-0583.html'
>G. Zhang</a
></li
><li
><a href = './CATALOG/https:__orcid.org_0000-0002-6953-3986.html'
>X. Luo</a
></li
></ul
></td
></tr
><tr
><th
><span
>path<sup
><a href = 'http://schema.org/contentUrl' title = 'Definition of: path'
>?</a
></sup
></span
></th
><td
><a href = './'
>.</a
></td
></tr
><tr
><th
><span
>contact<sup
><a href = 'http://schema.org/accountablePerson' title = 'Definition of: contact'
>?</a
></sup
></span
></th
><td
><a href = './CATALOG/http:__orcid.org_0000-0002-8367-6908.html'
>J. Xuan</a
></td
></tr
><tr
><th
><span
>citation<sup
><a href = 'http://schema.org/citation' title = 'Definition of: citation'
>?</a
></sup
></span
></th
><td
><a href = './CATALOG/http:__dx.doi.org_10.1109_TCYB.2014.2386282.html'
>Topic Model for Graph Mining</a
></td
></tr
><tr
><th
><span
>hasPart<sup
><a href = 'http://schema.org/hasPart' title = 'Definition of: hasPart'
>?</a
></sup
></span
></th
><td
><ul
><li
><a href = './CATALOG/gtm_gs.m.html'
>gtm_gs.m</a
></li
><li
><a href = './CATALOG/LICENSE.html'
>LICENSE</a
></li
><li
><a href = './CATALOG/README.txt.html'
>README.txt</a
></li
><li
><a href = './CATALOG/test_gtm.m.html'
>test_gtm.m</a
></li
></ul
></td
></tr
><tr
><th
><span
>license<sup
><a href = 'http://schema.org/license' title = 'Definition of: license'
>?</a
></sup
></span
></th
><td
><a href = './CATALOG/https:__www.gnu.org_licenses_gpl-3.0.en.html.html'
>GPL 3</a
></td
></tr
><tr
><th
><span
>publisher<sup
><a href = 'http://schema.org/publisher' title = 'Definition of: publisher'
>?</a
></sup
></span
></th
><td
>University of Technology Sydney</td
></tr
></table
>
<p>This file was created at 2018-09-06T23:50:26.381Z by
<a href='https://code.research.uts.edu.au/eresearch/calcytejs'>
Calcyte</a> which implements the <a href='https://github.com/UTS-eResearch/datacrate/blob/master/spec/0.3/data_crate_specification_v0.3.md'>
Draft DataCrate Packaging format</a>, version 0.3
</p>
</body>
</html>
This diff is collapsed.
This diff is collapsed.
This demo implements sampling-based inference for the Graph Topic Model (GTM). More details about the model can be found in the following reference:
@ARTICLE{7015568,
author={J. Xuan and J. Lu and G. Zhang and X. Luo},
journal={IEEE Transactions on Cybernetics},
title={Topic Model for Graph Mining},
year={2015},
volume={45},
number={12},
pages={2792-2803},
keywords={data mining;data structures;graph theory;pattern classification;text analysis;unsupervised learning;Bernoulli distribution;GTM;bag-of-word assumption;classification;edge modeling;graph dataset;graph mining;graph nodes;graph representation;graph supervised learning;graph-structured data;innovative graph topic model;latent Dirichlet allocation;latent topic discovery;unsupervised learning;Chemical elements;Chemicals;Data mining;Data models;Hidden Markov models;Inference algorithms;Vectors;Graph mining;latent Dirichlet allocation (LDA);topic model},
doi={10.1109/TCYB.2014.2386282},
ISSN={2168-2267},
month={Dec},}
% Implementation of GTM by Gibbs sampling
%
%
%
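% Input:
% data.G - number of graphs
% data.V - node vocabulary size
% data.Ng - number of nodes in each graph (1 x G)
% data.graphs - cell array of graphs, each a struct with fields nodes and edges
% K - number of topics
% alpha - hyperparameter used as the gamma shape offset when sampling theta
% beta - hyperparameter used as the gamma shape when initialising phi
% Output:
%
% GTM - model output (struct with fields phi and theta)
% L_list - log-likelihood list (one entry per Gibbs iteration)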
%
% Random data for testing:
%
% K = 10;
% G = 100;
% V = 200;
% Ng = randi([1 10], 1, G);
% graphs = cell(1, G);
%
% for gi = 1 : G
% nodes = randi([1, V], 1, Ng(gi));
% edges = triu(randi([0 1], Ng(gi)));
% edges = edges + triu(edges, 1)';
%
% edges(logical(eye(Ng(gi)))) = Inf;
%
% graph.nodes = nodes;
% graph.edges = edges;
% graphs{gi} = graph;
% end
%
% data.G = G;
% data.V = V;
% data.Ng = Ng;
% data.graphs = graphs;
function [ GTM, L_list ] = gtm_gs( data, K, alpha, beta )
% GTM_GS  Gibbs sampler for the Graph Topic Model (GTM).
%
%   [GTM, L_list] = gtm_gs(data, K, alpha, beta) builds the initial sampler
%   state from data and then runs a fixed number of sweeps. Every sweep
%   resamples z, theta and phi (in that order) and evaluates the log
%   likelihood; progress and timings are printed to the console.
%
%   Inputs:
%     data  - struct handed to intialization (fields G, V, Ng, graphs)
%     K     - number of topics
%     alpha - hyperparameter forwarded to intialization and Update_theta
%     beta  - hyperparameter forwarded to intialization, Update_phi and
%             Evaluate_likehd
%
%   Outputs:
%     GTM    - struct with fields phi and theta, each the average of the
%              samples retained after burn-in
%     L_list - values returned by Evaluate_likehd, one per sweep, in order

    % Sampler schedule.
    n_sweeps   = 1000;  % total number of Gibbs sweeps
    n_burnin   = 100;   % no sample is retained up to and including this sweep
    keep_every = 20;    % after burn-in, only sweeps divisible by this are kept

    L_list = [];

    % Initial state.
    fprintf('----- GTM model initialization........ \n');
    [G, V, Ng, graphs, theta, z, phi, phi_norm, GKC, KV, KK] = intialization(data, K, alpha, beta);

    % Running sums of the retained samples.
    phi_sum   = zeros(K, V);
    theta_sum = zeros(G, K);
    n_kept    = 0;

    for sweep = 1 : n_sweeps
        fprintf('----- Gibbs sampling iternum: %d \n', sweep);

        % Topic assignments.
        fprintf('----- update z \n');
        tic;
        [z, GKC, KV, KK] = Update_z(G, K, Ng, graphs, theta, z, phi, phi_norm, GKC, KV, KK);
        fprintf('----- use time: %d \n', toc);

        % Per-graph topic proportions.
        fprintf('----- update theta \n');
        tic;
        theta = Update_theta(GKC, K, alpha);
        fprintf('----- use time: %d \n', toc);

        % Per-topic node distributions.
        fprintf('----- update phi \n');
        tic;
        [phi, phi_norm] = Update_phi(phi, phi_norm, K, V, KV, KK, beta);
        fprintf('----- use time: %d \n', toc);

        % Log likelihood of the current state.
        fprintf('----- evaluate logLikelihood \n');
        tic;
        [L] = Evaluate_likehd(phi, phi_norm, K, V, KV, KK, beta);
        fprintf('----- L: %d \n', L);
        fprintf('----- use time: %d \n', toc);
        L_list = [L_list, L];

        % Retain a thinned sample once burn-in is over.
        past_burnin = sweep > n_burnin;
        if past_burnin && rem(sweep, keep_every) == 0
            phi_sum   = phi_sum + phi;
            theta_sum = theta_sum + theta;
            n_kept    = n_kept + 1;
        end
    end

    % Posterior means over the retained samples.
    GTM.phi   = phi_sum / n_kept;
    GTM.theta = theta_sum / n_kept;
end
function [G, V, Ng, graphs, theta, z, phi, phi_norm, GKC, KV, KK] = intialization(data, K, alpha, beta)
% INTIALIZATION  Build the initial Gibbs-sampler state for GTM.
%
%   Every node of every graph is given a uniformly random topic in 1..K and
%   the count tables implied by that assignment are computed. theta and phi
%   are drawn as row-normalised gamma variates.
%
%   Inputs:
%     data  - struct with fields
%               G      number of graphs
%               V      node vocabulary size
%               Ng     number of nodes per graph (returned unchanged)
%               graphs cell array; graphs{gi}.nodes holds vocabulary indices,
%                      graphs{gi}.edges is a matrix whose entries equal to 1
%                      are counted as links and entries equal to 0 as
%                      non-links (any other value, e.g. Inf, is ignored)
%     K     - number of topics
%     alpha - gamma shape used for every entry of theta
%     beta  - gamma shape used for every entry of phi
%
%   Outputs:
%     G, V, Ng, graphs - copied from data
%     theta    - G x K, rows sum to 1
%     z        - 1 x G cell, z{gi} is the 1 x n topic assignment of graph gi
%     phi      - K x V, rows sum to 1
%     phi_norm - 1 x K, Euclidean norm of each row of phi
%     GKC      - G x K, number of nodes of graph gi assigned to topic k
%     KV       - K x V, number of nodes with vocabulary index v in topic k
%     KK       - K x 2 x K, KK(a, 1, b) counts edge entries equal to 1 and
%                KK(a, 2, b) entries equal to 0 between a node of topic a
%                (row) and a node of topic b (column)

    G = data.G;
    V = data.V;
    graphs = data.graphs;
    Ng = data.Ng;

    GKC = zeros(G, K);
    KK = zeros(K, 2, K);

    z = cell(1, G);
    words = cell(1, G);   % vocabulary indices per graph, stored as rows

    for gi = 1 : G
        g = graphs{gi};
        nodes_g = g.nodes;
        edges_g = g.edges;

        z_g = randi([1 K], 1, length(nodes_g));
        z{gi} = z_g;
        words{gi} = reshape(nodes_g, 1, []);

        % Nodes per topic in this graph.
        topic_idx = z_g(:);
        GKC(gi, :) = accumarray(topic_idx, ones(numel(topic_idx), 1), [K 1])';

        % Topic-pair counts over linked (== 1) and unlinked (== 0) entries.
        KK(:, 1, :) = KK(:, 1, :) + reshape(topic_pair_counts_(edges_g == 1, z_g, K), K, 1, K);
        KK(:, 2, :) = KK(:, 2, :) + reshape(topic_pair_counts_(edges_g == 0, z_g, K), K, 1, K);
    end

    % theta: normalised gamma draws, one row per graph.
    theta = gamrnd(alpha * ones(G, K), 1);
    theta = theta ./ repmat(sum(theta, 2), 1, K);

    % phi: normalised gamma draws, one row per topic.
    phi = gamrnd(beta * ones(K, V), 1);
    phi = phi ./ repmat(sum(phi, 2), 1, V);

    % Topic-by-vocabulary counts over all nodes of all graphs. accumarray
    % yields all-zero rows for topics that received no node.
    k_all = [z{:}];
    v_all = [words{:}];
    k_all = k_all(:);
    v_all = v_all(:);
    KV = accumarray([k_all, v_all], ones(numel(k_all), 1), [K V]);

    phi_norm = zeros(1, K);
    for k = 1 : K
        phi_norm(k) = norm(phi(k, :));
    end
end

function counts = topic_pair_counts_(mask, z_g, K)
% Count, for every true entry (i, j) of mask, the topic pair (z_g(i), z_g(j));
% returns a K x K matrix indexed by (row topic, column topic).
    [src, dst] = find(mask);
    row_topic = z_g(src);
    col_topic = z_g(dst);
    row_topic = row_topic(:);
    col_topic = col_topic(:);
    counts = accumarray([row_topic, col_topic], ones(numel(row_topic), 1), [K K]);
end
function theta = Update_theta(GKC, K, alpha)
% UPDATE_THETA  Resample the per-graph topic proportions.
%
%   Draws one unit-scale gamma variate per entry of GKC, with shape
%   GKC + alpha, and rescales every row so that it sums to one.
%
%   GKC   - count matrix with K columns (one row per graph)
%   K     - number of topics (column count used for the row rescaling)
%   alpha - value added to every count to form the gamma shape
    gamma_draws = gamrnd(GKC + alpha, 1);
    row_totals = sum(gamma_draws, 2);
    theta = gamma_draws ./ repmat(row_totals, 1, K);
end
function [z, GKC, KV, KK] = Update_z(G, K, Ng, graphs, theta, z, phi, phi_norm, GKC, KV, KK)
KK = zeros(K, 2, K);
k_new_ratio = 0;
%zall = sum(Ng);
for gi = 1 : G
z_g = z{gi};
g = graphs{gi};
nodes_g = g.nodes;
edges_g = g.edges;
for zi = 1 : Ng(gi)
k_old = z_g(zi);
v_zi = nodes_g(zi);
pk = log(theta(gi, :) + eps) + transp(log(phi(:, v_zi) + eps));
link_z = find(edges_g(zi, :) == 1);
link_k = z_g(link_z);
if ~isempty(link_k)
pk = pk + transpose(sum(log( (phi * phi(link_k, :)') ./ ( phi_norm' * phi_norm(link_k)) + eps), 2));
end
nolink_z = find(edges_g(zi, :) == 0);
nolink_k = z_g(nolink_z);
if ~isempty(nolink_k)
pk = pk + transpose(sum(log(1- (phi * phi(nolink_k, :)') ./ ( phi_norm' * phi_norm(nolink_k)) + eps), 2));
end
p = 1 ./ sum(exp(repmat(pk, [K 1]) - repmat(pk', [1 K])), 2);
k_new = find(mnrnd(1, p) == 1);
if k_old ~= k_new
k_new_ratio = k_new_ratio + 1;
z_g(zi) = k_new;
GKC(gi, k_old) = GKC(gi, k_old) - 1;
GKC(gi, k_new) = GKC(gi, k_new) + 1;
KV(k_old, v_zi) = KV(k_old, v_zi) - 1;
KV(k_new, v_zi) = KV(k_new, v_zi) + 1;
end
end
z{gi} = z_g;
%
for k = 1 : K
idxz = find(z_g == k);
if ~isempty(idxz)
[~,link_z] = find(edges_g(idxz, :) == 1);
link_k = z_g(link_z);
[~,nolink_z] = find(edges_g(idxz, :) == 0);
nolink_k = z_g(nolink_z);
if ~isempty(link_k)
tab_link = tabulate(link_k);
k_list_tmp = tab_link(:, 1);
k_count_tmp = tab_link(:, 2);
KK(k, 1, k_list_tmp) = reshape(KK(k, 1, k_list_tmp), length(k_list_tmp), 1) + k_count_tmp ;
end