Cloned library GKlib with extra build files for internal package management.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

305 lines
9.5 KiB

2 years ago
/*!
\file
\brief It takes as input two CSR matrices and finds for each row of the
first matrix the most similar rows in the second matrix.
\date 9/27/2014
\author George
\version \verbatim $Id: m2mnbrs.c 17699 2014-09-27 18:05:31Z karypis $ \endverbatim
*/
#include <GKlib.h>
/*************************************************************************/
/*! Data structures for the code */
/*************************************************************************/
/* Run-time configuration and timing state that parse_cmdline() fills in and
   that main() and FindNeighbors() consume. */
typedef struct {
int simtype; /*!< The similarity type to use */
int nnbrs; /*!< The maximum number of nearest neighbors to output */
float minsim; /*!< The minimum similarity to use for keeping neighbors */
int verbosity; /*!< The reporting verbosity level */
char *qfile; /*!< The file storing the query documents */
char *cfile; /*!< The file storing the collection documents */
char *outfile; /*!< The file where the output will be stored; NULL when no output file was given */
/* timers */
double timer_global; /*!< Wall-clock timer wrapped around the FindNeighbors() call in main() */
double timer_1; /*!< Wall-clock timer wrapped around the per-query loop in FindNeighbors() */
double timer_2; /*!< Cleared and reported by main(); not started in the code visible here */
double timer_3; /*!< Cleared and reported by main(); not started in the code visible here */
double timer_4; /*!< Cleared and reported by main(); not started in the code visible here */
} params_t;
/*************************************************************************/
/*! Constants */
/*************************************************************************/
/* Versions: printed as major.minor.subminor in the banner emitted by main() */
#define VER_MAJOR 0
#define VER_MINOR 1
#define VER_SUBMINOR 0
/* Command-line option codes: the values stored in the last field of each
   long_options[] entry and dispatched on by the switch in parse_cmdline() */
#define CMD_SIMTYPE 10
#define CMD_NNBRS 20
#define CMD_MINSIM 22
#define CMD_VERBOSITY 70
#define CMD_HELP 100
/* The text labels for the different simtypes. main() indexes this table
   directly with params->simtype when it prints the banner.
   NOTE(review): the position of each label must therefore match the numeric
   value of the corresponding GK_CSR_* similarity constant, which is defined
   outside this file -- confirm. */
static char simtypenames[][10] = {"", "dotp", "cos", "jac", ""};
/*************************************************************************/
/*! Local variables */
/*************************************************************************/
/* Long options accepted on the command line; handed to gk_getopt_long_only()
   in parse_cmdline(). The last field of each entry is the CMD_* code that the
   switch in parse_cmdline() dispatches on.
   NOTE(review): the fields presumably follow getopt_long()'s struct option
   layout {name, has_arg, flag, val}, with the all-zero entry acting as the
   end-of-table marker -- confirm against GKlib. */
static struct gk_option long_options[] = {
{"simtype", 1, 0, CMD_SIMTYPE},
{"nnbrs", 1, 0, CMD_NNBRS},
{"minsim", 1, 0, CMD_MINSIM},
{"verbosity", 1, 0, CMD_VERBOSITY},
{"help", 0, 0, CMD_HELP},
{0, 0, 0, 0}
};
/* Maps the string values accepted by -simtype to GK_CSR_* similarity ids.
   parse_cmdline() looks the user's string up with gk_GetStringID() and
   treats a result of -1 as an invalid value. The trailing {NULL, 0} entry
   appears to act as the end-of-table marker. */
static gk_StringMap_t simtype_options[] = {
{"cos", GK_CSR_COS},
{"jac", GK_CSR_JAC},
{NULL, 0}
};
/*-------------------------------------------------------------------
 * Mini help: the text printed, one row per line, by the -help option.
 *
 * The table must end with an empty string: the printing loop in
 * parse_cmdline() stops at the first row whose length is zero, so no
 * other row may be empty (blank lines are written as " ").
 *-------------------------------------------------------------------*/
static char helpstr[][100] =
{
" ",
"Usage: m2mnbrs [options] qfile cfile [outfile]",
" ",
" Options",
" -simtype=string",
" Specifies the type of similarity to use. Possible values are:",
" cos - Cosine similarity",
" jac - Jaccard similarity [default]",
" ",
" -nnbrs=int",
" Specifies the maximum number of nearest neighbors.",
" A value of -1 indicates that all neighbors will be considered.",
" Default value is 100.",
" ",
" -minsim=float",
" The minimum allowed similarity between neighbors. ",
" Default value is .25.",
" ",
" -verbosity=int",
" Specifies the level of debugging information to be displayed.",
" Default value is 0.",
" ",
" -help",
" Prints this message.",
""
};
/*************************************************************************/
/*! Function prototypes */
/*************************************************************************/
/* Parses argv into a newly allocated params_t (defined later in this file). */
params_t *parse_cmdline(int argc, char *argv[]);
/* Searches cmat for the rows most similar to each row of qmat (defined later
   in this file). */
void FindNeighbors(params_t *params, gk_csr_t *qmat, gk_csr_t *cmat);
/*************************************************************************/
/*! Parses the command line into a newly allocated params_t.

    Options are read with gk_getopt_long_only() using long_options[]. The
    remaining positional arguments are, in order, the query file, the
    collection file and an optional output file.

    \param argc is the number of command-line arguments.
    \param argv is the array of command-line arguments.
    \returns a params_t allocated with gk_malloc() whose qfile, cfile and
             (when given) outfile fields are gk_strdup() copies of the
             corresponding arguments; outfile is NULL otherwise. The timer
             fields are not set here; main() clears them before use.

    The process exits when -help is requested, when an unrecognized option
    is seen, or when fewer than two positional arguments are supplied.
    An invalid -simtype value and non-existent input files are reported
    through errexit().
*/
/*************************************************************************/
params_t *parse_cmdline(int argc, char *argv[])
{
  int i;
  int c, option_index;
  params_t *params;

  params = (params_t *)gk_malloc(sizeof(params_t), "parse_cmdline: params");

  /* initialize the params data structure */
  params->simtype = GK_CSR_JAC;
  params->nnbrs = 100;
  params->minsim = .25;
  /* -1 acts like the documented default of 0: FindNeighbors() only tests
     verbosity > 0 */
  params->verbosity = -1;
  params->qfile = NULL;
  params->cfile = NULL;
  params->outfile = NULL;

  /* Parse the command line arguments */
  while ((c = gk_getopt_long_only(argc, argv, "", long_options, &option_index)) != -1) {
    switch (c) {
      case CMD_SIMTYPE:
        if (gk_optarg) {
          if ((params->simtype = gk_GetStringID(simtype_options, gk_optarg)) == -1)
            errexit("Invalid simtype of %s.\n", gk_optarg);
        }
        break;

      case CMD_NNBRS:
        if (gk_optarg) params->nnbrs = atoi(gk_optarg);
        break;

      case CMD_MINSIM:
        if (gk_optarg) params->minsim = atof(gk_optarg);
        break;

      case CMD_VERBOSITY:
        if (gk_optarg) params->verbosity = atoi(gk_optarg);
        break;

      case CMD_HELP:
        /* helpstr[] is terminated by an empty string */
        for (i=0; strlen(helpstr[i]) > 0; i++)
          printf("%s\n", helpstr[i]);
        exit(EXIT_SUCCESS);
        break;

      case '?':
      default:
        printf("Illegal command-line option(s)\nUse %s -help for a summary of the options.\n", argv[0]);
        exit(EXIT_FAILURE);
    }
  }

  /* Get the input/output file info. Both the query and the collection file
     are mandatory; requiring only one would let argv[argc] (NULL) be used
     as the collection file name below. */
  if (argc-gk_optind < 2) {
    printf("Missing input/output file info.\n Use %s -help for a summary of the options.\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  params->qfile = gk_strdup(argv[gk_optind++]);
  params->cfile = gk_strdup(argv[gk_optind++]);
  /* the output file is optional */
  params->outfile = (gk_optind < argc ? gk_strdup(argv[gk_optind++]) : NULL);

  if (!gk_fexists(params->qfile))
    errexit("input file %s does not exist.\n", params->qfile);
  if (!gk_fexists(params->cfile))
    errexit("input file %s does not exist.\n", params->cfile);

  return params;
}
/*************************************************************************/
/*! This is the entry point of the program */
/**************************************************************************/
int main(int argc, char *argv[])
{
params_t *params;
gk_csr_t *qmat, *cmat;
int rc = EXIT_SUCCESS;
params = parse_cmdline(argc, argv);
qmat = gk_csr_Read(params->qfile, GK_CSR_FMT_CSR, 1, 0);
cmat = gk_csr_Read(params->cfile, GK_CSR_FMT_CSR, 1, 0);
printf("********************************************************************************\n");
printf("sd (%d.%d.%d) Copyright 2014, GK.\n", VER_MAJOR, VER_MINOR, VER_SUBMINOR);
printf(" simtype=%s, nnbrs=%d, minsim=%.2f\n",
simtypenames[params->simtype], params->nnbrs, params->minsim);
printf(" qfile=%s, nrows=%d, ncols=%d, nnz=%zd\n",
params->qfile, qmat->nrows, qmat->ncols, qmat->rowptr[qmat->nrows]);
printf(" cfile=%s, nrows=%d, ncols=%d, nnz=%zd\n",
params->cfile, cmat->nrows, cmat->ncols, cmat->rowptr[cmat->nrows]);
gk_clearwctimer(params->timer_global);
gk_clearwctimer(params->timer_1);
gk_clearwctimer(params->timer_2);
gk_clearwctimer(params->timer_3);
gk_clearwctimer(params->timer_4);
gk_startwctimer(params->timer_global);
FindNeighbors(params, qmat, cmat);
gk_stopwctimer(params->timer_global);
printf(" wclock: %.2lfs\n", gk_getwctimer(params->timer_global));
printf(" timer1: %.2lfs\n", gk_getwctimer(params->timer_1));
printf(" timer2: %.2lfs\n", gk_getwctimer(params->timer_2));
printf(" timer3: %.2lfs\n", gk_getwctimer(params->timer_3));
printf(" timer4: %.2lfs\n", gk_getwctimer(params->timer_4));
printf("********************************************************************************\n");
gk_csr_Free(&qmat);
gk_csr_Free(&cmat);
exit(rc);
}
/*************************************************************************/
/*! Finds, for every row of the query matrix, its most similar rows in the
    collection matrix and optionally writes them to the output file.

    \param params supplies simtype, nnbrs, minsim, verbosity and outfile;
           timer_1 is started and stopped around the per-query loop.
    \param qmat is the query matrix. Its rows are normalized in place when
           the similarity type is GK_CSR_COS.
    \param cmat is the collection matrix. It is normalized in place for
           GK_CSR_COS, and a column index and squared row norms are
           computed for it.

    When params->outfile is set, one line is written per hit containing
    the query row number followed by the hit's val and key fields.
    NOTE(review): val/key are presumably the collection row id and the
    similarity score -- confirm against gk_csr_GetSimilarRows().
    When no output file is given the hits are computed but not reported.
*/
/**************************************************************************/
void FindNeighbors(params_t *params, gk_csr_t *qmat, gk_csr_t *cmat)
{
  int qrow, h, nfound;
  int32_t *marker;
  gk_fkv_t *hits, *cand;
  FILE *out = NULL;

  GKASSERT(qmat->ncols <= cmat->ncols);

  /* cosine similarity works on unit-length rows */
  if (params->simtype == GK_CSR_COS) {
    gk_csr_Normalize(qmat, GK_CSR_ROW, 2);
    gk_csr_Normalize(cmat, GK_CSR_ROW, 2);
  }

  /* column-wise (inverted) index and row norms of the collection */
  gk_csr_CreateIndex(cmat, GK_CSR_COL);
  gk_csr_ComputeSquaredNorms(cmat, GK_CSR_ROW);

  /* the output file is only opened when one was requested */
  if (params->outfile)
    out = gk_fopen(params->outfile, "w", "FindNeighbors: fpout");

  /* working arrays, each with one entry per collection row */
  hits   = gk_fkvmalloc(cmat->nrows, "FindNeighbors: hits");
  marker = gk_i32smalloc(cmat->nrows, -1, "FindNeighbors: marker");
  cand   = gk_fkvmalloc(cmat->nrows, "FindNeighbors: cand");

  gk_startwctimer(params->timer_1);
  for (qrow = 0; qrow < qmat->nrows; qrow++) {
    if (params->verbosity > 0)
      printf("Working on query %7d\n", qrow);

    /* pass the query row as (length, column indices, values) */
    nfound = gk_csr_GetSimilarRows(cmat,
                 qmat->rowptr[qrow+1]-qmat->rowptr[qrow],
                 qmat->rowind+qmat->rowptr[qrow],
                 qmat->rowval+qmat->rowptr[qrow],
                 params->simtype, params->nnbrs, params->minsim,
                 hits, marker, cand);

    /* nothing to report without an output file */
    if (!out)
      continue;

    for (h = 0; h < nfound; h++)
      fprintf(out, "%8d %8zd %.3f\n", qrow, hits[h].val, hits[h].key);
  }
  gk_stopwctimer(params->timer_1);

  /* release the output file and the working arrays */
  if (out)
    gk_fclose(out);

  gk_free((void **)&hits, &marker, &cand, LTERM);
}