
Commit 12a6b9c

merge
1 parent 20fd4ec commit 12a6b9c

6 files changed (+200 -35 lines)

README.md (+116 -1)
@@ -29,9 +29,124 @@ Once the path is under `capstone`, please run the command in following format.
 python3 main.py
 ```

-After the program complete, the results of program will display on terminal and result plots will be saved in `results`.
+After the program completes, the results will be displayed in the terminal and the plots will be saved in `./results`.
+
+## Description of Dataset
+
+This dataset is downloaded from [link](http://multilayer.it.uu.se/datasets.html). In this graph, the multiple layers represent relationships between 61 employees of a university department in five different aspects: (i) coworking, (ii) having lunch together, (iii) Facebook friendship, (iv) offline friendship (having fun together), and (v) co-authorship.
+
+#### Dataset Name: AUCS
+
+#### Type: Multilayer Graph
+
+#### Layers
+
+1. Facebook, UNDIRECTED
+2. Lunch, UNDIRECTED
+3. Coauthor, UNDIRECTED
+4. Leisure, UNDIRECTED
+5. Work, UNDIRECTED
+
+#### Actor Attributes
+
+1. ResearchGroup, STRING
+2. Role, STRING
+
+## Results
+
+```console
+--------------------Load multilayers graph--------------------
+
+Graph: lunch
+Number of nodes: 55
+Number of edges: 176
+
+Graph: facebook
+Number of nodes: 55
+Number of edges: 116
+
+Graph: leisure
+Number of nodes: 55
+Number of edges: 88
+
+Graph: work
+Number of nodes: 55
+Number of edges: 155
+
+Graph: coauthor
+Number of nodes: 55
+Number of edges: 21
+--------------------Perform alpha selection--------------------
+
+Alpha = 0.2
+Density = 0.06324630230880231
+NMI = 0.28437039334841613
+
+Alpha = 0.3
+Density = 0.05426587301587302
+NMI = 0.22851191671984766
+
+Alpha = 0.4
+Density = 0.074259768009768
+NMI = 0.2724979205931001
+
+Alpha = 0.5
+Density = 0.0838045634920635
+NMI = 0.24702647831111396
+
+Alpha = 0.6
+Density = 0.05803571428571429
+NMI = 0.24631830263834753
+
+Alpha = 0.7
+Density = 0.057311958874458876
+NMI = 0.26013012383823736
+
+Alpha = 0.8
+Density = 0.059573412698412695
+NMI = 0.27394942614468387
+
+Alpha = 0.9
+Density = 0.06098935786435787
+NMI = 0.2531115624498492
+
+Alpha = 1.0
+Density = 0.08007756132756133
+NMI = 0.2515334583668016
+--------------------Multilayer Result--------------------
+NMI: 0.28437039334841613
+Purity: 0.34545454545454546
+
+--------------------Single layer Result--------------------
+
+Layer: lunch
+NMI: 0.4078232316115382
+Purity: 0.4909090909090909
+
+Layer: facebook
+NMI: 0.23710067652109873
+Purity: 0.2909090909090909
+
+Layer: leisure
+NMI: 0.36515019997995346
+Purity: 0.45454545454545453
+
+Layer: work
+NMI: 0.3601914640378153
+Purity: 0.4727272727272727
+
+Layer: coauthor
+NMI: 0.30730821666442587
+Purity: 0.4
+```
 
 ## Project Member:
 
 1. Wen-Han Hu (whu24)
 2. Yang-Kai Chou (ychou3)
+
+## Reference
+
+1. Dong, Xiaowen, et al. "Clustering on multi-layer graphs via subspace analysis on Grassmann manifolds." IEEE Transactions on Signal Processing 62.4 (2013): 905-918.
+2. Kim, Jungeun, and Jae-Gil Lee. "Community detection in multi-layer graphs: A survey." ACM SIGMOD Record 44.3 (2015): 37-48.
+3. Zhang, Pan. "Evaluating accuracy of community detection using the relative normalized mutual information." Journal of Statistical Mechanics: Theory and Experiment 2015.11 (2015): P11006.
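
For orientation, the sketch below loads the layered AUCS data described in the README above into one `networkx` graph per layer, mirroring the parsing that `main.py` performs further down in this commit. It is a minimal sketch: the edge-list path and the `node1,node2,layer` / `node,group` column layout are assumptions inferred from that parsing code, not documented properties of the dataset.

```python
# Hedged sketch: load the AUCS layers into one networkx graph per layer.
# './data/aucs_edgelist.txt' is a hypothetical path; './data/aucs_nodelist.txt'
# and the column layout are taken from the parsing shown in main.py below.
from collections import defaultdict

import networkx as nx


def load_layers(edge_path='./data/aucs_edgelist.txt',
                node_path='./data/aucs_nodelist.txt'):
    # Actors whose research group is known; 'NA' rows are skipped, as in main.py.
    labeled = set()
    with open(node_path) as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) < 2:
                continue
            node, group = parts[0], parts[1]
            if group != 'NA':
                labeled.add(node)

    # One undirected graph per layer name found in the edge list.
    graphs = defaultdict(nx.Graph)
    with open(edge_path) as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) < 3:
                continue
            u, v, layer = parts[0], parts[1], parts[2]
            if u in labeled and v in labeled:
                graphs[layer].add_edge(u, v)
    return dict(graphs)


if __name__ == "__main__":
    for name, g in load_layers().items():
        print(name, g.number_of_nodes(), g.number_of_edges())
```
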

algo.py (+11 -2)
@@ -50,5 +50,14 @@ def SCML(G, k, alpha):
     kmeans = KMeans(init='k-means++', n_clusters=k,
                     n_init=30, random_state=1).fit(U)
     labels = kmeans.predict(U)
-    sse = kmeans.inertia_
-    return labels, U, sse
+    return labels
+
+
+def onelayer(g, k):
+    lap = nx.normalized_laplacian_matrix(g)
+    U = getU(lap, k).real.todense()
+    U = preprocessing.normalize(U, axis=1, norm='l1')
+    kmeans = KMeans(init='k-means++', n_clusters=k,
+                    n_init=30, random_state=1).fit(U)
+    labels = kmeans.predict(U)
+    return labels
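
The new `onelayer` helper relies on a `getU` spectral-embedding function that this diff does not show. As a point of reference, here is a self-contained sketch of the same single-layer pipeline with the embedding step written out (eigenvectors of the k smallest eigenvalues of the normalized Laplacian); the dense eigensolver and the toy graph are illustrative assumptions, not the repository's `getU`.

```python
# Self-contained sketch of single-layer spectral clustering, standing in for
# onelayer() + the repo's unseen getU() helper. Illustrative only.
import networkx as nx
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans


def single_layer_clusters(g, k):
    # Normalized Laplacian of one layer, as a dense array (fine for 55 nodes).
    lap = nx.normalized_laplacian_matrix(g).toarray()
    # Eigenvectors of the k smallest eigenvalues form the spectral embedding.
    _, eigvecs = np.linalg.eigh(lap)
    U = preprocessing.normalize(eigvecs[:, :k], axis=1, norm='l1')
    return KMeans(init='k-means++', n_clusters=k,
                  n_init=30, random_state=1).fit_predict(U)


if __name__ == "__main__":
    toy = nx.karate_club_graph()  # toy stand-in for a single AUCS layer
    print(single_layer_clusters(toy, k=4))
```
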

main.py (+72 -32)
@@ -1,9 +1,19 @@
 import networkx as nx
 import numpy as np
 from algo import SCML
+from algo import onelayer
 import matplotlib.pyplot as plt
 from collections import Counter
-from sklearn.metrics import *
+from sklearn.metrics import v_measure_score as nmi
+from sklearn.metrics import silhouette_score
+from sklearn import metrics
+
+
+def purity_score(y_true, y_pred):
+    # compute contingency matrix (also called confusion matrix)
+    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
+    # return purity
+    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
 
 
 def init_graph():
@@ -12,10 +22,28 @@ def init_graph():
     with open(path) as f:
         for line in f:
             line = line.strip().split(',')
-            g.add_node(line[0])
+            if line[1] == 'NA':
+                continue
+            else:
+                g.add_node(line[0])
     return g
 
 
+def get_truth():
+    truth = []
+    na_list = []
+    path = './data/aucs_nodelist.txt'
+    with open(path) as f:
+        for line in f:
+            line = line.strip().split(',')
+            t = line[1]
+            if t == 'NA':
+                na_list.append(line[0])
+            else:
+                truth.append(int(t[-1])-1)
+    return truth, na_list
+
+
 def plot_elbow(x, y, name):
     plt.figure(figsize=(8, 8))
     plt.plot(x, y, '-o')
@@ -41,7 +69,7 @@ def get_partition(labels, nodes):
 
 def get_score(graph_list, partitions):
     density = np.zeros((len(graph_list), len(partitions)))
-    conductance = np.zeros((len(graph_list), len(partitions)))
+    # conductance = np.zeros((len(graph_list), len(partitions)))
     for i, g in enumerate(graph_list):
         for k in range(len(partitions)):
             g_sub = g.subgraph(partitions[k])
@@ -71,64 +99,76 @@ def main():
         'work': work,
         'coauthor': coauthor,
     }
+    truth, na = get_truth()
 
     # Load data into graph
     print("--------------------------------------------------Load multilayers graph--------------------------------------------------")
     with open(path) as f:
         for line in f:
             line = line.strip().split(',')
             name = line[2]
-            table[name].add_edge(line[0], line[1])
+            if line[0] in na or line[1] in na:
+                continue
+            else:
+                table[name].add_edge(line[0], line[1])
     for name, graph in table.items():
         print("\nGraph: {}".format(name))
         print("\tNumber of nodes: {}".format(nx.number_of_nodes(graph)))
         print("\tNumber of edges: {}".format(nx.number_of_edges(graph)))
 
-    graph_list = [lunch, facebook, leisure, work, coauthor]
+    graph_list = [lunch, work, coauthor, leisure]
     node_list = list(lunch.nodes)
 
-    # Tunning k
-    print("--------------------------------------------------Perform k clusters selection--------------------------------------------------")
-    sse_list = []
-    range_k = np.arange(2, 15)
-    for k in range_k:
-        labels, matrix, sse = SCML(graph_list, k, 0.5)
-        score = silhouette_score(matrix, labels, random_state=42)
-        print("Number of clusters k = {}".format(k),
-              ",Silhouette Score = {}".format(round(score, 5)))
-        sse_list.append(sse)
+    # # Tunning k
+    # print("--------------------------------------------------Perform k clusters selection--------------------------------------------------")
+    # sse_list = []
+    # range_k = np.arange(2, 15)
+    # for k in range_k:
+    #     labels, sse = SCML(graph_list, k, 0.5)
+    #     score = silhouette_score(matrix, labels, random_state=42)
+    #     print("Number of clusters k = {}".format(k),
+    #           ",Silhouette Score = {}".format(round(score, 5)))
+    #     sse_list.append(sse)
 
-    # Plot elbow method for k
-    plot_elbow(range_k, sse_list, "Selection of k")
+    # # Plot elbow method for k
+    # plot_elbow(range_k, sse_list, "Selection of k")
 
     # Tunning alpha
     print("--------------------------------------------------Perform alpha selection--------------------------------------------------")
     range_a = np.arange(0.2, 1.1, 0.1)
     den = []
+    nmi_list = []
     for alpha in range_a:
-        labels, matrix, sse = SCML(graph_list, 8, alpha)
+        labels = SCML(graph_list, 8, alpha)
         partitions = get_partition(labels, node_list)
         density = get_score(graph_list, partitions)
         den.append(density)
-        print("Alpha = {}".format(round(alpha, 1)),
-              ", Density = {}".format(density))
+        print("\nAlpha = {}".format(round(alpha, 1)))
+        print("\tDensity = {}".format(density))
+        nmi_value = nmi(truth, labels)
+        print("\tNMI = {}".format(nmi_value))
+        nmi_list.append(nmi_value)
 
     # Plot elbow method for alpha
-    plot_elbow(range_a, den, "Selection of alpha")
+    plot_elbow(range_a, den, "Selection of alpha (Density)")
+    plot_elbow(range_a, nmi_list, "Selection of alpha (NMI)")
 
     # Select the best model
-    labels, matrix, sse = SCML(graph_list, 8, 0.5)
+    print("--------------------------------------------------Multilayer Result--------------------------------------------------")
+    labels = SCML(graph_list, 8, 0.2)
     partitions = get_partition(labels, node_list)
-    get_score(graph_list, partitions)
-
-    # Evaluation of clustring
-    # print("--------------------------------------------------Clustering evaluation--------------------------------------------------")
-    # db_index = round(davies_bouldin_score(matrix, labels), 5)
-    # ch_index = round(calinski_harabasz_score(matrix, labels), 5)
-    # s_coef = round(silhouette_score(matrix, labels, metric='euclidean'), 5)
-    # print("Silhouette Score: {}".format(s_coef))
-    # print("Davies-Bouldin Score: {}".format(db_index))
-    # print("Calinski-Harabaz Score: {}".format(ch_index))
+
+    print("NMI: {}".format(nmi(truth, labels)))
+    purity = purity_score(truth, labels)
+    print("Purity: {}".format(purity))
+    print("--------------------------------------------------Single layer Result--------------------------------------------------")
+    for name, g in table.items():
+        print("\nLayer: {}".format(name))
+        labels = onelayer(g, 8)
+        # print(labels)
+        print("\tNMI: {}".format(nmi(truth, labels)))
+        purity = purity_score(truth, labels)
+        print("\tPurity: {}".format(purity))
 
 
 if __name__ == "__main__":
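
A note on the metric used above: `main.py` reports "NMI" through scikit-learn's `v_measure_score`, and the V-measure equals normalized mutual information with arithmetic averaging, so the NMI figures in the README are V-measure values. A quick check with made-up labelings (not project data):

```python
# Verify that v_measure_score matches NMI with arithmetic averaging
# (the default average_method in recent scikit-learn releases).
from sklearn.metrics import normalized_mutual_info_score, v_measure_score

truth = [0, 0, 1, 1, 2, 2]   # illustrative ground-truth labels
pred = [0, 0, 1, 2, 2, 2]    # illustrative cluster assignments

print(v_measure_score(truth, pred))
print(normalized_mutual_info_score(truth, pred, average_method='arithmetic'))
# Both lines print the same value.
```
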

requirements.txt (+1)
@@ -2,3 +2,4 @@ networkx
 numpy
 scipy
 sklearn
+matplotlib

results/Selection of alpha.png (-40.6 KB, binary file not shown)

results/Selection of k.png (-32.2 KB, binary file not shown)
