Graphviz  2.41.20171026.1811
clustering.c
Go to the documentation of this file.
1 /* $Id$Revision: */
2 /* vim:set shiftwidth=4 ts=8: */
3 
4 /*************************************************************************
5  * Copyright (c) 2011 AT&T Intellectual Property
6  * All rights reserved. This program and the accompanying materials
7  * are made available under the terms of the Eclipse Public License v1.0
8  * which accompanies this distribution, and is available at
9  * http://www.eclipse.org/legal/epl-v10.html
10  *
11  * Contributors: See CVS logs. Details at http://www.graphviz.org/
12  *************************************************************************/
13 
14 #define STANDALONE
15 #include "general.h"
16 #include "SparseMatrix.h"
17 #include "clustering.h"
18 
19 
20 
21 static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_init(SparseMatrix A, int level){
23  int n = A->n, i, j;
24 
27 
28  if (!A) return NULL;
29  assert(A->m == n);
30  grid = MALLOC(sizeof(struct Multilevel_Modularity_Clustering_struct));
31  grid->level = level;
32  grid->n = n;
33  grid->A = A;
34  grid->P = NULL;
35  grid->R = NULL;
36  grid->next = NULL;
37  grid->prev = NULL;
38  grid->delete_top_level_A = FALSE;
39  grid->matching = MALLOC(sizeof(real)*(n));
40  grid->deg = NULL;
42 
43  if (level == 0){
44  real modularity = 0;
45  int *ia = A->ia, *ja = A->ja, n = A->n;
46  real deg_total = 0;
47  real *deg, *a = (real*) (A->a);
48  real *indeg;
49 
50  grid->deg_total = 0.;
51  grid->deg = MALLOC(sizeof(real)*(n));
52  deg = grid->deg;
53 
54  indeg = MALLOC(sizeof(real)*n);
55  for (i = 0; i < n; i++){
56  deg[i] = 0;
57  indeg[i] = 0.;
58  for (j = ia[i]; j < ia[i+1]; j++){
59  deg[i] += a[j];
60  if (ja[j] == i) indeg[i] = a[j];
61  }
62  deg_total += deg[i];
63  }
64  if (deg_total == 0) deg_total = 1;
65  for (i = 0; i < n; i++){
66  modularity += (indeg[i] - deg[i]*deg[i]/deg_total)/deg_total;
67  }
68  grid->deg_total = deg_total;
69  grid->deg = deg;
70  grid->modularity = modularity;
71  FREE(indeg);
72  }
73 
74 
75  return grid;
76 }
77 
78 static void Multilevel_Modularity_Clustering_delete(Multilevel_Modularity_Clustering grid){
79  if (!grid) return;
80  if (grid->A){
81  if (grid->level == 0) {
82  if (grid->delete_top_level_A) SparseMatrix_delete(grid->A);
83  } else {
84  SparseMatrix_delete(grid->A);
85  }
86  }
87  SparseMatrix_delete(grid->P);
88  SparseMatrix_delete(grid->R);
89  FREE(grid->matching);
90  FREE(grid->deg);
91 
92  Multilevel_Modularity_Clustering_delete(grid->next);
93  FREE(grid);
94 }
95 
96 static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_establish(Multilevel_Modularity_Clustering grid, int ncluster_target){
97  int *matching = grid->matching;
98  SparseMatrix A = grid->A;
99  int n = grid->n, level = grid->level, nc = 0;
100  real modularity = 0;
101  int *ia = A->ia, *ja = A->ja;
102  real *a;
103  real *deg = grid->deg;
104  real *deg_new;
105  int i, j, jj, jc, jmax;
106  real inv_deg_total = 1./(grid->deg_total);
107  real *deg_inter, gain;
108  int *mask;
109  real maxgain;
110  real total_gain = 0;
111  modularity = grid->modularity;
112 
113  deg_new = MALLOC(sizeof(real)*n);
114  deg_inter = MALLOC(sizeof(real)*n);
115  mask = MALLOC(sizeof(int)*n);
116  for (i = 0; i < n; i++) mask[i] = -1;
117 
118  assert(n == A->n);
119  for (i = 0; i < n; i++) matching[i] = UNMATCHED;
120 
121  /* gain in merging node i into cluster j is
122  deg(i,j)/deg_total - 2*deg(i)*deg(j)/deg_total^2
123  */
124  a = (real*) A->a;
125  for (i = 0; i < n; i++){
126  if (matching[i] != UNMATCHED) continue;
127  /* accumulate connections between i and clusters */
128  for (j = ia[i]; j < ia[i+1]; j++){
129  jj = ja[j];
130  if (jj == i) continue;
131  if ((jc=matching[jj]) != UNMATCHED){
132  if (mask[jc] != i) {
133  mask[jc] = i;
134  deg_inter[jc] = a[j];
135  } else {
136  deg_inter[jc] += a[j];
137  }
138  }
139  }
140 
141  maxgain = 0;
142  jmax = -1;
143  for (j = ia[i]; j < ia[i+1]; j++){
144  jj = ja[j];
145  if (jj == i) continue;
146  if ((jc=matching[jj]) == UNMATCHED){
147  /* the first 2 is due to the fact that deg_iter gives edge weight from i to jj, but there are also edges from jj to i */
148  gain = (2*a[j] - 2*deg[i]*deg[jj]*inv_deg_total)*inv_deg_total;
149  } else {
150  if (deg_inter[jc] > 0){
151  /* the first 2 is due to the fact that deg_iter gives edge weight from i to jc, but there are also edges from jc to i */
152  gain = (2*deg_inter[jc] - 2*deg[i]*deg_new[jc]*inv_deg_total)*inv_deg_total;
153  // printf("mod = %f deg_inter[jc] =%f, deg[i] = %f, deg_new[jc]=%f, gain = %f\n",modularity, deg_inter[jc],deg[i],deg_new[jc],gain);
154  deg_inter[jc] = -1; /* so that we do not redo the calulation when we hit another neighbor in cluster jc */
155  } else {
156  gain = -1;
157  }
158  }
159  if (jmax < 0 || gain > maxgain){
160  maxgain = gain;
161  jmax = jj;
162  }
163 
164  }
165 
166  /* now merge i and jmax */
167  if (maxgain > 0 || grid->agglomerate_regardless){
168  total_gain += maxgain;
169  jc = matching[jmax];
170  if (jc == UNMATCHED){
171  //fprintf(stderr, "maxgain=%f, merge %d, %d\n",maxgain, i, jmax);
172  matching[i] = matching[jmax] = nc;
173  deg_new[nc] = deg[i] + deg[jmax];
174  nc++;
175  } else {
176  //fprintf(stderr, "maxgain=%f, merge with existing cluster %d, %d\n",maxgain, i, jc);
177  deg_new[jc] += deg[i];
178  matching[i] = jc;
179  }
180  } else {
181  assert(maxgain <= 0);
182  matching[i] = nc;
183  deg_new[nc] = deg[i];
184  nc++;
185  }
186 
187  }
188 
189  if (Verbose) fprintf(stderr,"modularity = %f new modularity = %f level = %d, n = %d, nc = %d, gain = %g\n", modularity, modularity + total_gain,
190  level, n, nc, total_gain);
191 
192  /* !!!!!!!!!!!!!!!!!!!!!! */
193  if (ncluster_target > 0){
194  if (nc <= ncluster_target && n >= ncluster_target){
195  if (n - ncluster_target > ncluster_target - nc){/* ncluster = nc */
196 
197  } else if (n - ncluster_target <= ncluster_target - nc){/* ncluster_target close to n */
198  fprintf(stderr,"ncluster_target = %d, close to n=%d\n", ncluster_target, n);
199  for (i = 0; i < n; i++) matching[i] = i;
200  FREE(deg_new);
201  goto RETURN;
202  }
203  } else if (n < ncluster_target){
204  fprintf(stderr,"n < target\n");
205  for (i = 0; i < n; i++) matching[i] = i;
206  FREE(deg_new);
207  goto RETURN;
208  }
209  }
210 
211  if (nc >= 1 && (total_gain > 0 || nc < n)){
212  /* now set up restriction and prolongation operator */
213  SparseMatrix P, R, R0, B, cA;
214  real one = 1.;
216 
218  for (i = 0; i < n; i++){
219  jj = matching[i];
220  SparseMatrix_coordinate_form_add_entries(R0, 1, &jj, &i, &one);
221  }
224  P = SparseMatrix_transpose(R);
225  B = SparseMatrix_multiply(R, A);
226  if (!B) goto RETURN;
227  cA = SparseMatrix_multiply(B, P);
228  if (!cA) goto RETURN;
230  grid->P = P;
231  grid->R = R;
232  level++;
233  cgrid = Multilevel_Modularity_Clustering_init(cA, level);
234  deg_new = REALLOC(deg_new, nc*sizeof(real));
235  cgrid->deg = deg_new;
236  cgrid->modularity = grid->modularity + total_gain;
237  cgrid->deg_total = grid->deg_total;
238  cgrid = Multilevel_Modularity_Clustering_establish(cgrid, ncluster_target);
239  grid->next = cgrid;
240  cgrid->prev = grid;
241  } else {
242  /* if we want a small number of cluster but right now we have too many, we will force agglomeration */
243  if (ncluster_target > 0 && nc > ncluster_target && !(grid->agglomerate_regardless)){
245  FREE(deg_inter);
246  FREE(mask);
247  FREE(deg_new);
248  return Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
249  }
250  /* no more improvement, stop and final clustering found */
251  for (i = 0; i < n; i++) matching[i] = i;
252  FREE(deg_new);
253  }
254 
255  RETURN:
256  FREE(deg_inter);
257  FREE(mask);
258  return grid;
259 }
260 
261 static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_new(SparseMatrix A0, int ncluster_target){
262  /* ncluster_target is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
263  is desired. The resulting clustering will give as close to this number as possible.
264  If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
265  . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
266  . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
267  . selected among nc or nc2, which ever is closer to ncluster_target.
268  Default: ncluster_target <= 0 */
269 
271  SparseMatrix A = A0;
272 
275  }
276  grid = Multilevel_Modularity_Clustering_init(A, 0);
277 
278  grid = Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
279 
280  if (A != A0) grid->delete_top_level_A = TRUE;/* be sure to clean up later */
281  return grid;
282 }
283 
284 
285 static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target,
286  int *nclusters, int **assignment, real *modularity, int *flag){
287  /* find a clustering of vertices by maximize modularity
288  A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
289 
290  ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
291  is desired. The resulting clustering will give as close to this number as possible.
292  If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
293  . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
294  . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
295  . selected among nc or nc2, which ever is closer to ncluster_target.
296  Default: ncluster_target <= 0
297 
298  nclusters: on output the number of clusters
299  assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
300  */
301 
303  int *matching, i;
304  SparseMatrix P;
305  real *u;
306  assert(A->m == A->n);
307 
308  *modularity = 0.;
309 
310  *flag = 0;
311 
312  grid = Multilevel_Modularity_Clustering_new(A, ncluster_target);
313 
314  /* find coarsest */
315  cgrid = grid;
316  while (cgrid->next){
317  cgrid = cgrid->next;
318  }
319 
320  /* project clustering up */
321  u = MALLOC(sizeof(real)*cgrid->n);
322  for (i = 0; i < cgrid->n; i++) u[i] = (real) (cgrid->matching)[i];
323  *nclusters = cgrid->n;
324  *modularity = cgrid->modularity;
325 
326  while (cgrid->prev){
327  real *v = NULL;
328  P = cgrid->prev->P;
330  FREE(u);
331  u = v;
332  cgrid = cgrid->prev;
333  }
334 
335  if (*assignment){
336  matching = *assignment;
337  } else {
338  matching = MALLOC(sizeof(int)*(grid->n));
339  *assignment = matching;
340  }
341  for (i = 0; i < grid->n; i++) (matching)[i] = (int) u[i];
342  FREE(u);
343 
344  Multilevel_Modularity_Clustering_delete(grid);
345 
346 }
347 
348 
349 
350 void modularity_clustering(SparseMatrix A, int inplace, int ncluster_target, int use_value,
351  int *nclusters, int **assignment, real *modularity, int *flag){
352  /* find a clustering of vertices by maximize modularity
353  A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
354  inplace: whether A can e modified. If true, A will be modified by removing diagonal.
355  ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
356  is desired. The resulting clustering will give as close to this number as possible.
357  If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
358  . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
359  . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
360  . selected among nc or nc2, which ever is closer to ncluster_target.
361  Default: ncluster_target <= 0
362  nclusters: on output the number of clusters
363  assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
364  */
365  SparseMatrix B;
366 
367  *flag = 0;
368 
369  assert(A->m == A->n);
370 
372 
373  if (!inplace && B == A) {
374  B = SparseMatrix_copy(A);
375  }
376 
378 
379  if (B->type != MATRIX_TYPE_REAL || !use_value) B = SparseMatrix_set_entries_to_real_one(B);
380 
381  hierachical_modularity_clustering(B, ncluster_target, nclusters, assignment, modularity, flag);
382 
383  if (B != A) SparseMatrix_delete(B);
384 
385 }
Multilevel_Modularity_Clustering next
Definition: clustering.h:25
void modularity_clustering(SparseMatrix A, int inplace, int ncluster_target, int use_value, int *nclusters, int **assignment, real *modularity, int *flag)
Definition: clustering.c:350
SparseMatrix SparseMatrix_set_entries_to_real_one(SparseMatrix A)
SparseMatrix SparseMatrix_copy(SparseMatrix A)
#define FREE
Definition: PriorityQueue.c:23
SparseMatrix SparseMatrix_new(int m, int n, int nz, int type, int format)
Definition: SparseMatrix.c:384
#define assert(x)
Definition: cghdr.h:47
SparseMatrix SparseMatrix_remove_diagonal(SparseMatrix A)
int SparseMatrix_is_symmetric(SparseMatrix A, int test_pattern_symmetry_only)
Definition: SparseMatrix.c:178
void SparseMatrix_multiply_vector(SparseMatrix A, real *v, real **res, int transposed)
int
Definition: grammar.c:1264
SparseMatrix SparseMatrix_get_real_adjacency_matrix_symmetrized(SparseMatrix A)
#define MALLOC
Definition: PriorityQueue.c:21
Multilevel_Modularity_Clustering prev
Definition: clustering.h:26
SparseMatrix SparseMatrix_symmetrize(SparseMatrix A, int pattern_symmetric_only)
Definition: SparseMatrix.c:151
void SparseMatrix_delete(SparseMatrix A)
Definition: SparseMatrix.c:411
#define REALLOC
Definition: PriorityQueue.c:22
#define NULL
Definition: logic.h:39
EXTERN unsigned char Verbose
Definition: globals.h:64
for(;;)
Definition: grammar.c:1846
SparseMatrix SparseMatrix_multiply(SparseMatrix A, SparseMatrix B)
SparseMatrix SparseMatrix_coordinate_form_add_entries(SparseMatrix A, int nentries, int *irn, int *jcn, void *val)
#define FALSE
Definition: cgraph.h:35
SparseMatrix SparseMatrix_from_coordinate_format(SparseMatrix A)
Definition: SparseMatrix.c:797
SparseMatrix SparseMatrix_transpose(SparseMatrix A)
Definition: SparseMatrix.c:69
#define TRUE
Definition: cgraph.h:38
#define real
Definition: general.h:34