You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
210 lines
6.8 KiB
210 lines
6.8 KiB
/*!
|
|
* \file
|
|
* \brief Frequent/Closed itemset discovery routines
|
|
*
|
|
* This file contains the code for finding frequent/closed itemests. These routines
|
|
* are implemented using a call-back mechanism to deal with the discovered itemsets.
|
|
*
|
|
* \date 6/13/2008
|
|
* \author George Karypis
|
|
* \version\verbatim $Id: itemsets.c 19240 2015-10-22 12:41:19Z karypis $ \endverbatim
|
|
*/
|
|
|
|
#include <GKlib.h>
|
|
|
|
/*-------------------------------------------------------------*/
|
|
/*! Data structures for use within this module */
|
|
/*-------------------------------------------------------------*/
|
|
typedef struct {
|
|
int minfreq; /* the minimum frequency of a pattern */
|
|
int maxfreq; /* the maximum frequency of a pattern */
|
|
int minlen; /* the minimum length of the requested pattern */
|
|
int maxlen; /* the maximum length of the requested pattern */
|
|
int tnitems; /* the initial range of the item space */
|
|
|
|
/* the call-back function */
|
|
void (*callback)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids);
|
|
void *stateptr; /* the user-supplied pointer to pass to the callback */
|
|
|
|
/* workspace variables */
|
|
int *rmarker;
|
|
gk_ikv_t *cand;
|
|
} isparams_t;
|
|
|
|
|
|
/*-------------------------------------------------------------*/
|
|
/*! Prototypes for this module */
|
|
/*-------------------------------------------------------------*/
|
|
void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,
|
|
int preflen, int *prefix);
|
|
gk_csr_t *itemsets_project_matrix(isparams_t *param, gk_csr_t *mat, int cid);
|
|
|
|
|
|
|
|
/*************************************************************************/
|
|
/*! The entry point of the frequent itemset discovery code */
|
|
/*************************************************************************/
|
|
void gk_find_frequent_itemsets(int ntrans, ssize_t *tranptr, int *tranind,
|
|
int minfreq, int maxfreq, int minlen, int maxlen,
|
|
void (*process_itemset)(void *stateptr, int nitems, int *itemids,
|
|
int ntrans, int *transids),
|
|
void *stateptr)
|
|
{
|
|
ssize_t i;
|
|
gk_csr_t *mat, *pmat;
|
|
isparams_t params;
|
|
int *pattern;
|
|
|
|
/* Create the matrix */
|
|
mat = gk_csr_Create();
|
|
mat->nrows = ntrans;
|
|
mat->ncols = tranind[gk_iargmax(tranptr[ntrans], tranind, 1)]+1;
|
|
mat->rowptr = gk_zcopy(ntrans+1, tranptr, gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr"));
|
|
mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind"));
|
|
mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids"));
|
|
|
|
/* Setup the parameters */
|
|
params.minfreq = minfreq;
|
|
params.maxfreq = (maxfreq == -1 ? mat->nrows : maxfreq);
|
|
params.minlen = minlen;
|
|
params.maxlen = (maxlen == -1 ? mat->ncols : maxlen);
|
|
params.tnitems = mat->ncols;
|
|
params.callback = process_itemset;
|
|
params.stateptr = stateptr;
|
|
params.rmarker = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker");
|
|
params.cand = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand");
|
|
|
|
/* Perform the initial projection */
|
|
gk_csr_CreateIndex(mat, GK_CSR_COL);
|
|
pmat = itemsets_project_matrix(¶ms, mat, -1);
|
|
gk_csr_Free(&mat);
|
|
|
|
pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern");
|
|
itemsets_find_frequent_itemsets(¶ms, pmat, 0, pattern);
|
|
|
|
gk_csr_Free(&pmat);
|
|
gk_free((void **)&pattern, ¶ms.rmarker, ¶ms.cand, LTERM);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*************************************************************************/
|
|
/*! The recursive routine for DFS-based frequent pattern discovery */
|
|
/*************************************************************************/
|
|
void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,
|
|
int preflen, int *prefix)
|
|
{
|
|
ssize_t i;
|
|
gk_csr_t *cmat;
|
|
|
|
/* Project each frequent column */
|
|
for (i=0; i<mat->ncols; i++) {
|
|
prefix[preflen] = mat->colids[i];
|
|
|
|
if (preflen+1 >= params->minlen)
|
|
(*params->callback)(params->stateptr, preflen+1, prefix,
|
|
mat->colptr[i+1]-mat->colptr[i], mat->colind+mat->colptr[i]);
|
|
|
|
if (preflen+1 < params->maxlen) {
|
|
cmat = itemsets_project_matrix(params, mat, i);
|
|
itemsets_find_frequent_itemsets(params, cmat, preflen+1, prefix);
|
|
gk_csr_Free(&cmat);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/******************************************************************************/
|
|
/*! This function projects a matrix w.r.t. to a particular column.
|
|
It performs the following steps:
|
|
- Determines the length of each column that is remaining.
|
|
- Sorts the columns in increasing length.
|
|
- Creates a column-based version of the matrix with the proper
|
|
column ordering.
|
|
*/
|
|
/*******************************************************************************/
|
|
gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid)
|
|
{
|
|
ssize_t i, j, k, ii, pnnz;
|
|
int nrows, ncols, pnrows, pncols;
|
|
ssize_t *colptr, *pcolptr;
|
|
int *colind, *colids, *pcolind, *pcolids, *rmarker;
|
|
gk_csr_t *pmat;
|
|
gk_ikv_t *cand;
|
|
|
|
nrows = mat->nrows;
|
|
ncols = mat->ncols;
|
|
colptr = mat->colptr;
|
|
colind = mat->colind;
|
|
colids = mat->colids;
|
|
|
|
rmarker = params->rmarker;
|
|
cand = params->cand;
|
|
|
|
|
|
/* Allocate space for the projected matrix based on what you know thus far */
|
|
pmat = gk_csr_Create();
|
|
pmat->nrows = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]);
|
|
|
|
|
|
/* Mark the rows that will be kept and determine the prowids */
|
|
if (cid == -1) { /* Initial projection */
|
|
gk_iset(nrows, 1, rmarker);
|
|
}
|
|
else { /* The other projections */
|
|
for (i=colptr[cid]; i<colptr[cid+1]; i++)
|
|
rmarker[colind[i]] = 1;
|
|
}
|
|
|
|
|
|
/* Determine the length of each column that will be left in the projected matrix */
|
|
for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) {
|
|
for (k=0, j=colptr[i]; j<colptr[i+1]; j++) {
|
|
k += rmarker[colind[j]];
|
|
}
|
|
if (k >= params->minfreq && k <= params->maxfreq) {
|
|
cand[pncols].val = i;
|
|
cand[pncols++].key = k;
|
|
pnnz += k;
|
|
}
|
|
}
|
|
|
|
/* Sort the columns in increasing order */
|
|
gk_ikvsorti(pncols, cand);
|
|
|
|
|
|
/* Allocate space for the remaining fields of the projected matrix */
|
|
pmat->ncols = pncols;
|
|
pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids");
|
|
pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr");
|
|
pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind");
|
|
|
|
|
|
/* Populate the projected matrix */
|
|
pcolptr[0] = 0;
|
|
for (pnnz=0, ii=0; ii<pncols; ii++) {
|
|
i = cand[ii].val;
|
|
for (j=colptr[i]; j<colptr[i+1]; j++) {
|
|
if (rmarker[colind[j]])
|
|
pcolind[pnnz++] = colind[j];
|
|
}
|
|
|
|
pcolids[ii] = colids[i];
|
|
pcolptr[ii+1] = pnnz;
|
|
}
|
|
|
|
|
|
/* Reset the rmarker array */
|
|
if (cid == -1) { /* Initial projection */
|
|
gk_iset(nrows, 0, rmarker);
|
|
}
|
|
else { /* The other projections */
|
|
for (i=colptr[cid]; i<colptr[cid+1]; i++)
|
|
rmarker[colind[i]] = 0;
|
|
}
|
|
|
|
|
|
return pmat;
|
|
}
|
|
|