Cloned library GKlib with extra build files for internal package management.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

301 lines
9.6 KiB

/*!
\file
\brief It takes as input two CSR matrices A and B and computes how
similar AA' and A'A are to BB' and B'B, respectively in terms
of the cosine similarity of the corresponding rows.
\date 11/09/2015
\author George
\version \verbatim $Id: m2mnbrs.c 17699 2014-09-27 18:05:31Z karypis $ \endverbatim
*/
#include <GKlib.h>
/*************************************************************************/
/*! Data structures for the code */
/*************************************************************************/
typedef struct {
int simtype; /*!< The similarity type to use */
int verbosity; /*!< The reporting verbosity level */
char *afile; /*!< The file storing the query documents */
char *bfile; /*!< The file storing the collection documents */
/* timers */
double timer_global;
} params_t;
/*************************************************************************/
/*! Constants */
/*************************************************************************/
/* Versions */
#define VER_MAJOR 0
#define VER_MINOR 1
#define VER_SUBMINOR 0
/* Command-line option codes */
#define CMD_SIMTYPE 10
#define CMD_VERBOSITY 70
#define CMD_HELP 100
/* The text labels for the different simtypes */
static char simtypenames[][10] = {"", "dotp", "cos", "jac", ""};
/*************************************************************************/
/*! Local variables */
/*************************************************************************/
static struct gk_option long_options[] = {
{"simtype", 1, 0, CMD_SIMTYPE},
{"verbosity", 1, 0, CMD_VERBOSITY},
{"help", 0, 0, CMD_HELP},
{0, 0, 0, 0}
};
static gk_StringMap_t simtype_options[] = {
{"dotp", GK_CSR_DOTP},
{"cos", GK_CSR_COS},
{"jac", GK_CSR_JAC},
{NULL, 0}
};
/*-------------------------------------------------------------------
* Mini help
*-------------------------------------------------------------------*/
static char helpstr[][100] =
{
" ",
"Usage: cmpnbrs [options] afile bfile",
" ",
" Options",
" -simtype=string",
" Specifies the type of similarity to use. Possible values are:",
" dotp - Dot-product similarity [default]",
" cos - Cosine similarity",
" jac - Jacquard similarity",
" ",
" -verbosity=int",
" Specifies the level of debugging information to be displayed.",
" Default value is 0.",
" ",
" -help",
" Prints this message.",
""
};
/*************************************************************************/
/*! Function prototypes */
/*************************************************************************/
params_t *parse_cmdline(int argc, char *argv[]);
double ComputeNeighborhoodSimilarity(params_t *params, gk_csr_t *amat, gk_csr_t *bmat);
/*************************************************************************/
/*! This is the entry point of the command-line argument parser */
/*************************************************************************/
params_t *parse_cmdline(int argc, char *argv[])
{
int i;
int c, option_index;
params_t *params;
params = (params_t *)gk_malloc(sizeof(params_t), "parse_cmdline: params");
/* initialize the params data structure */
params->simtype = GK_CSR_DOTP;
params->verbosity = -1;
params->afile = NULL;
params->bfile = NULL;
/* Parse the command line arguments */
while ((c = gk_getopt_long_only(argc, argv, "", long_options, &option_index)) != -1) {
switch (c) {
case CMD_SIMTYPE:
if (gk_optarg) {
if ((params->simtype = gk_GetStringID(simtype_options, gk_optarg)) == -1)
errexit("Invalid simtype of %s.\n", gk_optarg);
}
break;
case CMD_VERBOSITY:
if (gk_optarg) params->verbosity = atoi(gk_optarg);
break;
case CMD_HELP:
for (i=0; strlen(helpstr[i]) > 0; i++)
printf("%s\n", helpstr[i]);
exit(EXIT_SUCCESS);
break;
case '?':
default:
printf("Illegal command-line option(s)\nUse %s -help for a summary of the options.\n", argv[0]);
exit(EXIT_FAILURE);
}
}
/* Get the input/output file info */
if (argc-gk_optind != 2) {
printf("Missing input file info.\n Use %s -help for a summary of the options.\n", argv[0]);
exit(EXIT_FAILURE);
}
params->afile = gk_strdup(argv[gk_optind++]);
params->bfile = gk_strdup(argv[gk_optind++]);
if (!gk_fexists(params->afile))
errexit("input file %s does not exist.\n", params->afile);
if (!gk_fexists(params->bfile))
errexit("input file %s does not exist.\n", params->bfile);
return params;
}
/*************************************************************************/
/*! This is the entry point of the program */
/**************************************************************************/
int main(int argc, char *argv[])
{
params_t *params;
gk_csr_t *amat, *bmat, *amatt, *bmatt;
int rc = EXIT_SUCCESS;
params = parse_cmdline(argc, argv);
amat = gk_csr_Read(params->afile, GK_CSR_FMT_CSR, 1, 0);
bmat = gk_csr_Read(params->bfile, GK_CSR_FMT_CSR, 1, 0);
/* make the matrices of similar dimensions (if neccessary) */
GKASSERT(amat->nrows == bmat->nrows);
amat->ncols = gk_max(amat->ncols, bmat->ncols);
bmat->ncols = amat->ncols;
/* create the transpose matrices */
amatt = gk_csr_Transpose(amat);
bmatt = gk_csr_Transpose(bmat);
printf("********************************************************************************\n");
printf("cmpnbrs (%d.%d.%d) Copyright 2015, GK.\n", VER_MAJOR, VER_MINOR, VER_SUBMINOR);
printf(" simtype=%s\n",
simtypenames[params->simtype]);
printf(" afile=%s, nrows=%d, ncols=%d, nnz=%zd\n",
params->afile, amat->nrows, amat->ncols, amat->rowptr[amat->nrows]);
printf(" bfile=%s, nrows=%d, ncols=%d, nnz=%zd\n",
params->bfile, bmat->nrows, bmat->ncols, bmat->rowptr[bmat->nrows]);
gk_clearwctimer(params->timer_global);
gk_startwctimer(params->timer_global);
printf("SIM(AA', BB'): %.5lf\t", ComputeNeighborhoodSimilarity(params, amat, bmat));
printf("SIM(A'A, B'B): %.5lf\n", ComputeNeighborhoodSimilarity(params, amatt, bmatt));
gk_stopwctimer(params->timer_global);
printf(" wclock: %.2lfs\n", gk_getwctimer(params->timer_global));
printf("********************************************************************************\n");
gk_csr_Free(&amat);
gk_csr_Free(&bmat);
gk_csr_Free(&amatt);
gk_csr_Free(&bmatt);
exit(rc);
}
/*************************************************************************/
/*! Compares the neighbors of AA' vs BB' */
/**************************************************************************/
double ComputeNeighborhoodSimilarity(params_t *params, gk_csr_t *amat,
gk_csr_t *bmat)
{
int iR, iH, nahits, nbhits, ncmps;
int32_t *marker;
gk_fkv_t *ahits, *bhits, *cand;
double tabsim, abdot, anorm2, bnorm2, *avec, *bvec;
/* if cosine, make rows unit length */
if (params->simtype == GK_CSR_COS) {
gk_csr_Normalize(amat, GK_CSR_ROW, 2);
gk_csr_Normalize(bmat, GK_CSR_ROW, 2);
}
/* create the inverted index */
gk_csr_CreateIndex(amat, GK_CSR_COL);
gk_csr_CreateIndex(bmat, GK_CSR_COL);
/* compute the row squared norms */
gk_csr_ComputeSquaredNorms(amat, GK_CSR_ROW);
gk_csr_ComputeSquaredNorms(bmat, GK_CSR_ROW);
/* allocate memory for the necessary working arrays */
ahits = gk_fkvmalloc(amat->nrows, "ComputeNeighborhoodSimilarity: ahits");
bhits = gk_fkvmalloc(bmat->nrows, "ComputeNeighborhoodSimilarity: bhits");
marker = gk_i32smalloc(amat->nrows, -1, "ComputeNeighborhoodSimilarity: marker");
cand = gk_fkvmalloc(amat->nrows, "ComputeNeighborhoodSimilarity: cand");
avec = gk_dsmalloc(amat->nrows, 0.0, "ComputeNeighborhoodSimilarity: avec");
bvec = gk_dsmalloc(bmat->nrows, 0.0, "ComputeNeighborhoodSimilarity: bvec");
/* find the best neighbors for each row in the two matrices and compute
the cosine similarity between them. */
tabsim = 0.0;
ncmps = 0;
for (iR=0; iR<amat->nrows; iR++) {
if (params->verbosity > 1)
printf("Working on row %7d\n", iR);
if (amat->rowptr[iR+1]-amat->rowptr[iR] == 0 ||
bmat->rowptr[iR+1]-bmat->rowptr[iR] == 0)
continue;
nahits = gk_csr_GetSimilarRows(amat,
amat->rowptr[iR+1]-amat->rowptr[iR],
amat->rowind+amat->rowptr[iR],
amat->rowval+amat->rowptr[iR],
params->simtype, amat->nrows, 0.0,
ahits, marker, cand);
nbhits = gk_csr_GetSimilarRows(bmat,
bmat->rowptr[iR+1]-bmat->rowptr[iR],
bmat->rowind+bmat->rowptr[iR],
bmat->rowval+bmat->rowptr[iR],
params->simtype, bmat->nrows, 0.0,
bhits, marker, cand);
if (params->verbosity > 0)
printf("Row %7d %7d %7d %8zd %8zd\n", iR, nahits, nbhits,
amat->rowptr[iR+1]-amat->rowptr[iR], bmat->rowptr[iR+1]-bmat->rowptr[iR]);
for (iH=0; iH<nahits; iH++)
avec[ahits[iH].val] = ahits[iH].key;
for (iH=0; iH<nbhits; iH++)
bvec[bhits[iH].val] = bhits[iH].key;
for (abdot=anorm2=bnorm2=0.0, iH=0; iH<amat->nrows; iH++) {
abdot += avec[iH]*bvec[iH];
anorm2 += avec[iH]*avec[iH];
bnorm2 += bvec[iH]*bvec[iH];
}
tabsim += (abdot > 0 ? abdot/sqrt(anorm2*bnorm2) : 0.0);
ncmps++;
for (iH=0; iH<nahits; iH++)
avec[ahits[iH].val] = 0.0;
for (iH=0; iH<nbhits; iH++)
bvec[bhits[iH].val] = 0.0;
}
gk_free((void **)&ahits, &bhits, &marker, &cand, &avec, &bvec, LTERM);
return tabsim/ncmps;
}