/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**--------------------------------------------------------------------
** ** ** PATCHED 5/13/96, CJC
**
** Added code to countwords and countwordstr to disregard the last char
** if required by the config.h
** G. Hill  3/12/97  ghill@library.berkeley.edu
**
** Changed addentry, countwords, countwordstr, parsecomment, rintindex
** added createMetaEntryList, getMeta, parseMetaData
** to support METADATA
** G. Hill 3/18/97 ghill@library.berkeley.edu
**
** Changed removestops to support printing of stop words
** G. Hill 4/7/97
**
** Changed countwords, countwordstr, and parseMetaData to disregard the
** first char if required by the config.h
** G.Hill 10/16/97  ghill@library.berkeley.edu
**
** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively
** remove all ignore characters from the end of each word.
** P. Bergner  10/5/97  bergner@lcse.umn.edu
**
** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of
** the ignore first chars iterative.
** G. Hill 11/19/97 ghill@library.berkeley.edu
**
** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata
** G. Hill 1/14/98
**
** Added regex for replace rule G.Hill 1/98
**
** REQMETANAME - don't index meta tags not specified in MetaNames
** 10/11/99 - Bill Moseley
**
** change sprintf to snprintf to avoid corruption, use MAXPROPLEN instead of literal "20",
** added include of merge.h - missing declaration caused compile error in prototypes,
** added word length arg to Stem() call for strcat overflow checking in stemmer.c
** added safestrcpy() macro to avoid corruption from strcpy overflow
** SRE 11/17/99
**
** fixed misc problems pointed out by "gcc -Wall"
** SRE 2/22/00
**
** Added code for storing word positions in index file 
** Jose Ruiz 3/00 jmruiz@boe.es
**
** 04/00 - Jose Ruiz
** Added code for a hash table in index file for searching words
** via getfileinfo in search.c (Lots of addons). Better performance
** with big databases and/or heavy searches (a* or b* or c*)
**
** 04/00 - Jose Ruiz
** Improved number compression function (compress)
** New number decompress function
** Both converted into macros for better performance
*/


#include "swish.h"
#include "index.h"
#include "hash.h"
#include "mem.h"
#include "string.h"
#include "check.h"
#include "search.h"
#include "merge.h"
#include "docprop.h"
#include "stemmer.h"
#include "soundex.h"
#include "error.h"
#include "file.h"

#define MAXPROPLEN 20

/* Inserts a (filename, title) pair into an array of document entries
** that is kept sorted by filename (strcmp order), so the documents can
** later be walked alphabetically.
**
** e        - array to insert into; NULL creates a new, empty array first.
** filename - sort key; copied with estrdup().
** title    - copied with estrdup().
**
** Returns the array (newly allocated when e was NULL).  A filename that
** is already present is reported through progerr().
*/

struct docentryarray *addsortentry(e, filename, title)
struct docentryarray *e;
char *filename;
char *title;
{
	int lo, hi, slot, cmp, i;
	struct docentry *en;

	if (e == NULL) {
		/* First call: start from an empty array and fall through to
		** the ordinary insertion path below. */
		e = (struct docentryarray *) emalloc(sizeof(struct docentryarray));
		e->maxsize = SEARCHHASHSIZE;   /* Put what you like */
		e->dlist = (struct docentry **) emalloc(e->maxsize * sizeof(struct docentry *));
		e->currentsize = 0;
	}

	/* Look for the position to insert using a binary search */
	lo = 0;
	hi = e->currentsize - 1;
	slot = 0;
	cmp = 0;
	while (lo <= hi) {
		slot = lo + (hi - lo) / 2;
		cmp = strcmp(filename, e->dlist[slot]->filename);
		if (cmp == 0)
			progerr("You have found a bug!!");
		else if (cmp > 0)
			lo = slot + 1;
		else
			hi = slot - 1;
	}
	/* The last probe is smaller than the new key: insert after it */
	if (cmp > 0)
		slot++;

	en = (struct docentry *) emalloc(sizeof(struct docentry));
	en->filename = (char *) estrdup(filename);
	en->title = (char *) estrdup(title);

	e->currentsize++;
	if (e->currentsize >= e->maxsize) {
		e->maxsize *= 2;
		e->dlist = (struct docentry **) erealloc(e->dlist, e->maxsize * sizeof(struct docentry *));
	}

	/* Open a gap at 'slot'.  The last live element now belongs at index
	** currentsize-1; starting the shift there avoids reading the
	** uninitialised slot past the old end of the array. */
	for (i = e->currentsize - 1; i > slot; i--)
		e->dlist[i] = e->dlist[i - 1];
	e->dlist[slot] = en;

	return e;
}

/* Allocates a location record describing the first occurrence of a word
** in (filenum, metaName).  The position array starts with room for one
** position; addentry() doubles it on demand.
*/
static struct location *newwordlocation(int filenum, int emphasized, int structure, int metaName, int position)
{
	struct location *lp;

	lp = (struct location *) emalloc(sizeof(struct location));
	lp->filenum = filenum;
	lp->rank = 0;  /* Compute later */
	lp->frequency = 1;
	lp->emphasized = emphasized;
	lp->structure = structure;
	lp->metaName = metaName;
	lp->max_positions = 1;
	lp->position = (int *) emalloc(lp->max_positions * sizeof(int));
	lp->position[0] = position;
	lp->next = NULL;
	return lp;
}

/* Allocates an index entry for a word seen for the first time, with a
** single location record attached.
*/
static struct entry *newwordentry(char *word, int filenum, int emphasized, int structure, int metaName, int position)
{
	struct entry *en;

	en = (struct entry *) emalloc(sizeof(struct entry));
	en->word = (char *) estrdup(word);
	en->tfrequency = 1;
	en->currentfilenum = filenum;
	en->locationlist = newwordlocation(filenum, emphasized, structure, metaName, position);
	en->currentlocation = en->locationlist;
	return en;
}

/* Adds one occurrence of a word to the master index array, which is kept
** sorted by word (strcmp order).
**
** e          - index array; NULL creates a new array holding this word.
** word       - the word to record; copied when a new entry is created.
** filenum    - number of the file the word was found in.
** emphasized - emphasis value stored with (or, if non-zero, counted
**              into) the location record.
** structure  - structure bits; OR-ed into an existing location record.
** metaName   - metaName id the word was found under.
** position   - word position to append to the location's position list.
**
** Three cases are handled:
**  - new word: a new entry is inserted and totalwords is incremented;
**  - known word, new (filenum, metaName) pair: a location record is
**    appended to the entry's list;
**  - known word, known pair: the position is appended and the counters
**    of the existing location record are updated.
**
** Returns the (possibly newly allocated) array.
*/

struct entryarray *addentry(e, word, filenum, emphasized, structure, metaName, position)
struct entryarray *e;
char *word;
int filenum;
int emphasized;
int structure;
int metaName;
int position;
{
	int lo, hi, slot, cmp, i;
	struct entry *en;
	struct location *tp, *oldtp;

	if (e == NULL) {
		e = (struct entryarray *) emalloc(sizeof(struct entryarray));
		e->maxsize = SEARCHHASHSIZE;   /* Put what you like */
		e->elist = (struct entry **) emalloc(e->maxsize * sizeof(struct entry *));
		e->currentsize = 1;
		e->elist[0] = newwordentry(word, filenum, emphasized, structure, metaName, position);
		totalwords++;
		return e;
	}

	/* Look for the word (or its insertion slot) using a binary search */
	lo = 0;
	hi = e->currentsize - 1;
	slot = 0;
	cmp = 0;
	while (lo <= hi) {
		slot = lo + (hi - lo) / 2;
		cmp = strcmp(word, e->elist[slot]->word);
		if (cmp == 0)
			break;
		else if (cmp > 0)
			lo = slot + 1;
		else
			hi = slot - 1;
	}

	if (cmp == 0) {
		en = e->elist[slot];

		/* Start at currentlocation rather than locationlist: it is
		** moved forward whenever a new file number is seen, so the
		** scan does not walk records of earlier files (faster). */
		oldtp = NULL;
		tp = en->currentlocation;
		while (tp != NULL) {
			if (tp->filenum == filenum && tp->metaName == metaName)
				break;
			oldtp = tp;
			tp = tp->next;
		}

		if (tp == NULL) {
			/* First occurrence for this (filenum, metaName) pair */
			tp = newwordlocation(filenum, emphasized, structure, metaName, position);
			oldtp->next = tp;
			if (!emphasized)
				en->tfrequency = en->tfrequency + 1;
			if (en->currentfilenum != filenum) {
				en->currentfilenum = filenum;
				en->currentlocation = tp;
			}
		}
		else {
			/* Another occurrence in a known location: append position */
			if (tp->max_positions == tp->frequency) {
				tp->max_positions *= 2;
				tp->position = (int *) erealloc(tp->position, tp->max_positions * sizeof(int));
			}
			tp->position[tp->frequency] = position;
			tp->frequency++;
			if (emphasized)
				tp->emphasized++;
			tp->structure |= structure;
		}
		return e;
	}

	/* New word: build the entry and insert it at its sorted position */
	en = newwordentry(word, filenum, emphasized, structure, metaName, position);
	totalwords++;

	/* The last probe is smaller than the new word: insert after it */
	if (cmp > 0)
		slot++;

	e->currentsize++;
	if (e->currentsize >= e->maxsize) {
		e->maxsize *= 2;
		e->elist = (struct entry **) erealloc(e->elist, e->maxsize * sizeof(struct entry *));
	}

	/* Open a gap at 'slot'.  The last live element now belongs at index
	** currentsize-1; starting the shift there avoids reading the
	** uninitialised slot past the old end of the array. */
	for (i = e->currentsize - 1; i > slot; i--)
		e->elist[i] = e->elist[i - 1];
	e->elist[slot] = en;

	return e;
}

/* Appends a new file record to the master list of files.
**
** filep        - head of the list (NULL for an empty list).
** filename     - copied into the new record.
** title        - copied into the new record.
** size         - stored in the new record.
** newFileEntry - if not NULL, receives a pointer to the new record.
**
** The tail of the list is remembered in a function-local static, so
** appending does not walk the list.  Returns the head of the list.
*/

struct file *addtofilelist(filep, filename, title, size, newFileEntry)
struct file *filep;
char *filename;
char *title;
int size;
struct file ** newFileEntry;
{
	static struct file *tail = NULL;   /* last record appended by any call */
	struct file *node;

	node = (struct file *) emalloc(sizeof(struct file));
	node->filename = (char *) estrdup(filename);
	node->title = (char *) estrdup(title);
	node->size = size;
	node->docProperties = NULL;
	node->next = NULL;

	/* hand the new record to the caller when asked for */
	if (newFileEntry)
		*newFileEntry = node;

	if (!filep) {
		/* empty list: the new record becomes the head */
		filep = node;
	}
	else if (tail) {
		tail->next = node;
	}
	tail = node;

	return filep;
}

/* Walks the master list of files and returns how many records it holds
** (0 for a NULL list).
*/

int getfilecount(filep)
struct file *filep;
{
	int count = 0;

	while (filep) {
		count++;
		filep = filep->next;
	}

	return count;
}

/* Returns the time reported by getthetime(), formatted in local time as
** "dd/mm/YYYY HH:MM:SS ZONE" (4-digit year since 2/22/00).
**
** The result lives in a static buffer that is overwritten by the next
** call.  If the time cannot be converted or formatted, an empty string
** is returned instead of indeterminate buffer contents.
*/

char *getthedate()
{
	static char date[MAXSTRLEN];
	time_t now;
	struct tm *local;

	now = (time_t) getthetime();
	local = localtime(&now);

	/* localtime() may return NULL, and strftime() leaves the buffer
	** indeterminate when it returns 0; fall back to "" in both cases. */
	if (local == NULL ||
	    strftime(date, sizeof date, "%d/%m/%Y %H:%M:%S %Z", local) == 0)
		date[0] = '\0';

	return date;
}

/* Indexes all the words in a file and adds the appropriate information
** to the appropriate structures.
**
** vp             - opaque input handle, read through vgetc(), vsize(),
**                  vtell() and vseek().
** filename       - recorded in the master file list.
** title          - recorded in the master file list; when indextitleonly
**                  is set it is also the only text that gets indexed.
** indextitleonly - non-zero: index only the title (via countwordstr) and
**                  record a word total of 100 for the file.
**
** Returns the number of words indexed (or countwordstr()'s result in the
** title-only case).
**
** The file is read character by character.  Runs of word characters are
** collected into 'word'; when a run ends the word is entity-converted,
** lower-cased, stripped, translated, optionally stemmed / soundexed and
** then added to the index.  When INDEXTAGS is off, everything between
** '<' and an unescaped '>' is collected into 'tag' and interpreted:
** structure tags, <!-- META START/END --> comments, <META NAME CONTENT>,
** XML-style meta tags and (optionally) ordinary comments.
**
** Each open metaName has its own word position counter; a word found
** while several metaNames are open is added once per open metaName.
**
** File numbers are kept in a function-local static and skip multiples
** of 128.
**
** NOTE: a word that is still being collected when EOF is reached is not
** flushed, so a file ending in a word character loses its last word.
*/

int countwords(vp, filename, title, indextitleonly)
void *vp;
char *filename;
char *title;
int indextitleonly;
{
	int c, i, j, inword, ftotalwords, emphasized, structure;
	int metaNameOld, metaNameXML;
	static int *metaName = NULL;       /* stack of currently open metaNames */
	static int metaNamelen = 0;        /* capacity of metaName / positionMeta */
	int currentmetanames = 0;          /* number of open metaNames */
	static int filenum;                /* running file number */
	static int lenword = 0;
	static char *word = NULL;
	static int lentag = 0;
	static char *tag = NULL;
	struct file *thisFileEntry = NULL;
	static int *positionMeta = NULL;   /* word position per open metaName */
	int tmpposition = 1;               /* positionMeta[0] saved while metas are open */
	int docPropName;
	int tmpfilepos, cP;
	static int lenprop = 0;
	static char *prop = NULL;

	/* One-time allocation of the reusable work buffers */
	if (!lenword) word = (char *) emalloc((lenword = MAXWORDLEN) + 1);
	if (!lentag) tag = (char *) emalloc((lentag = MAXSTRLEN) + 1);
	if (!lenprop) prop = (char *) emalloc((lenprop = MAXSTRLEN) + 1);
	if (!metaNamelen) {
		metaName = (int *) emalloc((metaNamelen = 1) * sizeof(int));
		positionMeta = (int *) emalloc(metaNamelen * sizeof(int));
	}

	ftotalwords = 0;
	if (indextitleonly) {
		filelist = addtofilelist(filelist, filename, title, vsize(vp), NULL);
		filenum++;
		if (!(filenum % 128))
			filenum++;
		addtofwordtotals(filenum, 100);
		return (countwordstr(title, filenum, 0));
	}

	filelist = addtofilelist(filelist, filename, title, vsize(vp), &thisFileEntry);
	filenum++;
	if (!(filenum % 128))
		filenum++;

	c = 1;
	i = j = 0;
	inword = 0;
	emphasized = 0;
	structure = 1;
	metaName[0] = 1;
	positionMeta[0] = 1;
	currentmetanames = 0;

	while (c != EOF && (c = vgetc(vp)) != EOF)
	{
		if (!inword) {
			if (iswordchar(c)) {
				i = 0;
				word[i++] = c;
				inword = 1;
			}
		}
		else
		{
			if (!iswordchar(c))
			{
				/* End of a word: terminate and normalise it */
				if (i == lenword) {
					lenword *= 2;
					word = erealloc(word, lenword + 1);
				}
				word[i] = '\0';

				if (isokword(word))
				{
					word = SafeStrCopy(word, (char *) convertentities(word), &lenword);
				}

				/* Lower-case only after entity conversion, otherwise
				** &Aacute; would become &aacute; */
				for (i = 0; word[i]; i++)
					word[i] = tolower((unsigned char) word[i]);
				i = 0;

				/* Get rid of the last char's */
				stripIgnoreLastChars(word);

				/* Get rid of the first char */
				stripIgnoreFirstChars(word);

				/* Translate chars */
				TranslateChars(word);

				if (applyStemmingRules)
				{
					/* apply stemming algorithm to the word to index */
					Stem(word, lenword);
				}

				if (applySoundexRules)
				{
					/* apply soundex algorithm to the word to index */
					soundex(word);
				}

				/* isokword() has to run a second time to filter out
				** converted strings */
				if (hasokchars(word))
				{
					if (isokword(word))
					{
#ifdef DEBUG
						printf("\t%s %d\n", word, structure);
#endif
						if (!currentmetanames) {
							entrylist = (struct entryarray *) addentry(entrylist, word, filenum, emphasized, structure, metaName[0], positionMeta[0]);
							positionMeta[0]++;
						} else {
							/* one index entry per open metaName */
							for (i = 0; i < currentmetanames; i++) {
								entrylist = (struct entryarray *) addentry(entrylist, word, filenum, emphasized, structure, metaName[i], positionMeta[i]);
								positionMeta[i]++;
							}
						}
						ftotalwords++;
					}
					else
					{
						if ((int) strlen(word) < minwordlimit && !isstopword(word))
						{
							addStopList(word);
							addstophash(word);
						}
					}
				}
				inword = 0;
			}
			else {
				/* Still inside the word: grow the buffer if needed.
				** erealloc is used like everywhere else, instead of
				** an unchecked realloc. */
				if (i == lenword) {
					lenword *= 2;
					word = erealloc(word, lenword + 1);
				}
				word[i++] = c;
			}
		}

		if (c == '<' && !INDEXTAGS) {
			/* Collect the tag text up to the first unescaped '>' */
			j = 0;
			while ((c = vgetc(vp)) != EOF) {
				tag[j++] = c;
				if (j == lentag) {
					lentag *= 2;
					tag = (char *) erealloc(tag, lentag + 1);
				}
				if (c == '>' && notEscaped(tag, j)) {
					/* drop the closing '>' */
					if (j)
						tag[--j] = '\0';
					else
						tag[j] = '\0';
#ifdef DEBUG
					printf("t: %s\n", tag);
#endif
					structure = getstructure(tag, structure);
#ifdef DEBUG
					printf("s: %d\n", structure);
#endif
					if ((tag[0] == '!') &&
						lstrstr(tag, "META") &&
						(lstrstr(tag, "START") ||
						lstrstr(tag, "END")))
					{
						/* Comment-style meta section markers */
						if (lstrstr(tag, "START"))
						{
							metaNameOld = getMeta(tag, &docPropName);
							/* If there is not a legal metaName
							** the program quits into getMeta
							*/
							if (metaNameOld != 1) {
								if (currentmetanames == metaNamelen) {
									metaName = (int *) erealloc(metaName, (metaNamelen *= 2) * sizeof(int));
									positionMeta = (int *) erealloc(positionMeta, metaNamelen * sizeof(int));
								}

								metaName[currentmetanames] = metaNameOld;
								/* Preserve position */
								if (!currentmetanames) tmpposition = positionMeta[0];
								positionMeta[currentmetanames++] = 1;
								/* If it is also a property doc
								** store it
								** Only store until a < is
								** found; the read position is
								** restored afterwards so the text
								** is still indexed normally */
								if (docPropName) {
									tmpfilepos = vtell(vp);
									for (i = 0; (cP = vgetc(vp)) != EOF; i++) {
										if (i == lenprop) prop = erealloc(prop, (lenprop *= 2) + 1);
										if (cP == '<') break;
										prop[i] = (char) cP;
									}
									prop[i] = '\0';
									vseek(vp, tmpfilepos);
									addDocProperty(&thisFileEntry->docProperties, docPropName, prop);
								}
							}
						}
						else if (lstrstr(tag, "END"))
						{
							/* Close the innermost open metaName */
							if (currentmetanames) {
								currentmetanames--;
								if (!currentmetanames) {
									metaName[0] = 1;
									/* Restore position counter */
									positionMeta[0] = tmpposition;
								}
							}
						}
					}
					else if ((tag[0] != '!') &&
						(lstrstr(tag, "META")) &&
						(lstrstr(tag, "NAME")) &&
						(lstrstr(tag, "CONTENT")))
					{
						ftotalwords += parseMetaData(tag, filenum, structure, thisFileEntry);
					}
					/* Check for XML style */
					else if ((tag[0] != '!') && ((metaNameXML = getMetaXML(tag, &docPropName)) != 1))
					{
						if (currentmetanames == metaNamelen) {
							metaName = (int *) erealloc(metaName, (metaNamelen *= 2) * sizeof(int));
							positionMeta = (int *) erealloc(positionMeta, metaNamelen * sizeof(int));
						}
						metaName[currentmetanames] = metaNameXML;
						/* Preserve position counter */
						if (!currentmetanames) tmpposition = positionMeta[0];
						positionMeta[currentmetanames++] = 1;
						/* If it is also a property doc
						** store it
						** Only store until a < is
						** found */
						if (docPropName) {
							tmpfilepos = vtell(vp);
							for (i = 0; (cP = vgetc(vp)) != EOF; i++) {
								if (i == lenprop) prop = erealloc(prop, (lenprop *= 2) + 1);
								if (cP == '<') break;
								prop[i] = (char) cP;
							}
							prop[i] = '\0';
							vseek(vp, tmpfilepos);
							addDocProperty(&thisFileEntry->docProperties, docPropName, prop);
						}
					}
					else if ((tag[0] == '/') && ((metaNameXML = getMetaXML(tag + 1, &docPropName)) != 1))
					{
						/* XML-style closing tag: unwind the stack down
						** to the matching metaName, if it is open */
						if (currentmetanames) {
							for (i = currentmetanames - 1; i >= 0; i--)
								if (metaName[i] == metaNameXML) break;
							if (i >= 0) currentmetanames = i;
							if (!currentmetanames) {
								metaName[0] = 1;
								/* Restore position counter */
								positionMeta[0] = tmpposition;
							}
						}
					}
					else if ((tag[0] == '!') && indexComments)
					{
						ftotalwords += parsecomment(tag, filenum, structure, 1, &positionMeta[0]);
					}

					if ((structure & IN_HEADER) ||
						(structure & IN_TITLE))
						emphasized = 5;
					else
						emphasized = 0;

					break;
				}
			}
		}
	}
	addtofwordtotals(filenum, ftotalwords);
	return ftotalwords;
}

/* Indexes the words in a string, such as a file name or an
** HTML title.
**
** s          - NUL-terminated text to index (not modified).
** filenum    - file number the words are recorded under.
** emphasized - emphasis value passed on to addentry().
**
** The string is copied into a work buffer with one blank appended, so
** that a word ending at the end of the string is still terminated.
** Words are recorded with structure IN_FILE, metaName 1 and positions
** counted from 1.
**
** Returns the number of word-character runs found, whether or not they
** ended up in the index.
**
** NOTE(review): unlike countwords and parsecomment, no stemming or
** soundex is applied here - confirm whether that is intended.
*/

int countwordstr(s, filenum, emphasized)
char *s;
int filenum;
int emphasized;
{
	int i, j, inword, wordcount, ilen;
	char c;
	static int lenword = 0;
	static char *word = NULL;
	static int lentmpstr = 0;
	static char *tmpstr = NULL;
	int position = 1;    /* Position of word */

	i = 0;
	if (!lenword) word = (char *) emalloc((lenword = MAXWORDLEN) + 1);
	if (!lentmpstr) tmpstr = (char *) emalloc((lentmpstr = MAXFILELEN) + 1);

	/* Work on a copy that ends in a blank */
	ilen = strlen(s);
	if ((ilen + 1) >= lentmpstr) {
		lentmpstr = ilen + 1 + 200;
		tmpstr = erealloc(tmpstr, lentmpstr + 1);
	}
	memcpy(tmpstr, s, ilen);
	tmpstr[ilen] = ' ';
	tmpstr[ilen + 1] = '\0';

	for (j = inword = wordcount = 0; (c = tmpstr[j]) != '\0'; j++) {
		if (!inword) {
			if (iswordchar(c)) {
				i = 0;
				word[i++] = c;
				inword = 1;
			}
		}
		else {
			if (!iswordchar(c)) {
				/* End of a word: terminate and normalise it */
				wordcount++;
				if (i == lenword) {
					lenword *= 2;
					word = erealloc(word, lenword + 1);
				}
				word[i] = '\0';

				if (isokword(word))
					{ word = SafeStrCopy(word, (char *) convertentities(word), &lenword); }

				/* Lower-case only after the entities are converted */
				for (i = 0; word[i]; i++)
					word[i] = tolower((unsigned char) word[i]);

				/* Get rid of specified last char's */
				stripIgnoreLastChars(word);

				/* Get rid of the first char */
				stripIgnoreFirstChars(word);

				/* Translate chars */
				TranslateChars(word);

				if (hasokchars(word))
				{
					if (isokword(word)) {
						entrylist = (struct entryarray *) addentry(entrylist, word, filenum, emphasized, IN_FILE, 1, position);
						position++;
					}
					else
					{
						if ((int) strlen(word) < minwordlimit && !isstopword(word))
						{
							addStopList(word);
							addstophash(word);
						}
					}
				}
				inword = 0;
			}
			else {
				/* Still inside the word: grow the buffer if needed.
				** erealloc is used like everywhere else, instead of
				** an unchecked realloc. */
				if (i == lenword) {
					lenword *= 2;
					word = erealloc(word, lenword + 1);
				}
				word[i++] = c;
			}
		}
	}

	return wordcount;
}

/* Returns the structure bits that apply after an HTML tag has been seen.
**
** tag       - tag text without the surrounding '<' and '>'.  It is cut
**             at the first whitespace character while the element name
**             is compared, and restored before returning.
** structure - structure bits in effect before the tag.
**
** An opening tag sets the matching IN_* bit, a closing tag ("/name")
** clears it; any other tag leaves 'structure' unchanged.  Element names
** are compared case-insensitively.  EM, STRONG, B and I all map to
** IN_EMPHASIZED; H followed by a digit maps to IN_HEADER.
**
** Modified DLN 1999-10-24 - Comments and Cleaning
** TODO: Make sure that these allow for HTML attributes
*/

int getstructure(tag, structure)
char *tag;
int structure;
{
	char oldChar = 0;
	char *endOfTag = NULL;
	char *pos;

	/* Isolate the element name: truncate at the first whitespace, but
	** remember where it was and what was there.  The cast keeps the
	** <ctype.h> call defined for characters with the high bit set. */
	for (pos = tag; *pos; pos++) {
		if (isspace((unsigned char) *pos)) {
			endOfTag = pos;  /* remember where we are... */
			oldChar = *pos;  /* ...and what we saw */
			*pos = '\0';     /* truncate string, for now */
			break;
		}
	}

	/* HEAD */
	if (strcasecmp(tag, "/head") == 0)
		structure &= ~IN_HEAD;          /* Out */
	else if (strcasecmp(tag, "head") == 0)
		structure |= IN_HEAD;           /* In */
	/* TITLE */
	else if (strcasecmp(tag, "/title") == 0)
		structure &= ~IN_TITLE;         /* Out */
	else if (strcasecmp(tag, "title") == 0)
		structure |= IN_TITLE;          /* In */
	/* BODY */
	else if (strcasecmp(tag, "/body") == 0)
		structure &= ~IN_BODY;          /* Out */
	else if (strcasecmp(tag, "body") == 0)
		structure |= IN_BODY;           /* In */
	/* H1, H2, H3, H4, H5, H6 */
	else if (tag[0] == '/' &&
		tolower((unsigned char) tag[1]) == 'h' &&
		isdigit((unsigned char) tag[2]))
		structure &= ~IN_HEADER;        /* Out */
	else if (tolower((unsigned char) tag[0]) == 'h' &&
		isdigit((unsigned char) tag[1]))
		structure |= IN_HEADER;         /* In */
	/* EM, STRONG */
	else if ((strcasecmp(tag, "/em") == 0) || (strcasecmp(tag, "/strong") == 0))
		structure &= ~IN_EMPHASIZED;    /* Out */
	else if ((strcasecmp(tag, "em") == 0) || (strcasecmp(tag, "strong") == 0))
		structure |= IN_EMPHASIZED;     /* In */
	/* B, I are separate for semantics */
	else if ((strcasecmp(tag, "/b") == 0) || (strcasecmp(tag, "/i") == 0))
		structure &= ~IN_EMPHASIZED;    /* Out */
	else if ((strcasecmp(tag, "b") == 0) || (strcasecmp(tag, "i") == 0))
		structure |= IN_EMPHASIZED;     /* In */

	/* Put back the character that was overwritten above */
	if (endOfTag != NULL)
		*endOfTag = oldChar;

	return structure;
}

/* Parses the words in a comment and adds them to the index.
**
** tag       - comment text as collected by countwords; scanning starts
**             at tag[1], i.e. after the leading '!'.
** filenum   - file number the words are recorded under.
** structure - structure bits in effect; IN_COMMENTS is added.
** metaName  - metaName id the words are recorded under.
** position  - in/out word position counter; incremented once for every
**             word that is added to the index.
**
** Words are emphasized (value 5) when EMPHASIZECOMMENTS is set.
** Returns the number of word-character runs found, whether or not they
** ended up in the index.  A word that runs up to the terminating NUL is
** not flushed.
**
** Words are lower-cased after entity conversion, the same order that
** countwords and countwordstr use, so that an entity such as &Aacute;
** is not turned into &aacute; before it is converted.
*/

int parsecomment(tag, filenum, structure, metaName, position)
char *tag;
int filenum;
int structure;
int metaName;
int *position;
{
	int i, j, inword, wordcount, emphasized;
	char c;
	static int lenword = 0;
	static char *word = NULL;

	i = 0;
	if (!lenword) word = (char *) emalloc((lenword = MAXWORDLEN) + 1);

	if (EMPHASIZECOMMENTS)
		emphasized = 5;
	else
		emphasized = 0;
	structure |= IN_COMMENTS;

	for (j = 1, inword = wordcount = 0; (c = tag[j]) != '\0'; j++) {
		if (!inword) {
			if (iswordchar(c)) {
				i = 0;
				word[i++] = c;
				inword = 1;
			}
		}
		else {
			if (!iswordchar(c)) {
				/* End of a word: terminate and normalise it */
				wordcount++;
				if (i == lenword) {
					lenword *= 2;
					word = erealloc(word, lenword + 1);
				}
				word[i] = '\0';

				if (isokword(word))
					{ word = SafeStrCopy(word, (char *) convertentities(word), &lenword); }

				/* Lower-case only after the entities are converted */
				for (i = 0; word[i]; i++)
					word[i] = tolower((unsigned char) word[i]);

				/* Get rid of the last char's */
				stripIgnoreLastChars(word);

				/* Get rid of the first char */
				stripIgnoreFirstChars(word);

				/* Translate chars */
				TranslateChars(word);

				if (applyStemmingRules)
				{
					/* apply stemming algorithm to the word to index */
					Stem(word, lenword);
				}
				if (applySoundexRules)
				{
					/* apply soundex algorithm to the word to index */
					soundex(word);
				}
				if (hasokchars(word))
				{
					if (isokword(word))
					{
						entrylist = (struct entryarray *) addentry(entrylist, word, filenum, emphasized, structure, metaName, *position);
						(*position)++;
					}
					else
					{
						if ((int) strlen(word) < minwordlimit && !isstopword(word))
						{
							addStopList(word);
							addstophash(word);
						}
					}
				}
				inword = 0;
			}
			else {
				/* Still inside the word: grow the buffer if needed.
				** erealloc is used like everywhere else, instead of
				** an unchecked realloc. */
				if (i == lenword) {
					lenword *= 2;
					word = erealloc(word, lenword + 1);
				}
				word[i++] = c;
			}
		}
	}

	return wordcount;
}

/* Removes words that occur in over _plimit_ percent of the files and
** that occur in over _flimit_ files (marks them as stopwords, that is).
**
** ep         - index array; matching entries are removed from it.
** totalFiles - number of indexed files, used for the percentage.
** plimit     - percentage threshold; 100 or more disables the removal.
** flimit     - minimum number of location records a word must have.
**
** Returns the number of stopwords: those already present in
** hashstoplist plus the ones added here.
**
** 05/00 Jose Ruiz
** Word positions are recomputed when an automatic stopword is removed:
** for every removed word, each position of every remaining word that
** lies behind an occurrence of the removed word (same file, same
** metaName) is decremented.  The code was written for speed first, so
** it is not very clear.
**
** NOTE(review): the position update walks both location lists in step
** and appears to assume that they are ordered by filenum and metaName -
** confirm against addentry/sortentry.
*/
int removestops(ep, totalFiles, plimit, flimit)
struct entryarray *ep;
int totalFiles;
int plimit;
int flimit;
{
	int i, j, k, l, percent, wordfilecount, stopwords, stoppos, res;
	struct location *lp, *tmplp, *lpstop;
	struct entry *e;
	struct entry **estop;       /* entries removed as automatic stopwords */
	int estopsz, estopmsz;      /* used / allocated slots in estop */
	int hashval;
	struct swline *sp;

	/* Now let's count the number of stopwords!!
	*/
	for (stopwords = 0, hashval = 0; hashval < HASHSIZE; hashval++) {
		for (sp = hashstoplist[hashval]; sp != NULL; sp = sp->next)
			stopwords++;
	}

	/* Nothing to do for an empty index or a disabled limit.  Without
	** any files the percentage below would divide by zero. */
	if (!ep || !ep->currentsize || plimit >= 100 || totalFiles <= 0)
		return stopwords;

	estopsz = 0;
	estopmsz = 1;
	estop = (struct entry **) emalloc(estopmsz * sizeof(struct entry *));

	/* this is the easy part: Remove the automatic stopwords from
	** the array */
	for (i = 0; i < ep->currentsize; ) {
		wordfilecount = 0;
		for (lp = ep->elist[i]->locationlist; lp != NULL; lp = lp->next)
			wordfilecount++;

		percent = (int) (((float) wordfilecount / (float) totalFiles) * 100.0f);
		if (percent >= plimit && wordfilecount >= flimit) {
			addStopList(ep->elist[i]->word);
			addstophash(ep->elist[i]->word);
			stopwords++;
			e = ep->elist[i];
			/* Remove entry from array; i is not advanced because
			** the next entry has moved into slot i */
			for (j = i + 1; j < ep->currentsize; j++)
				ep->elist[j - 1] = ep->elist[j];
			ep->currentsize--;
			if (estopsz == estopmsz) {  /* More memory? */
				estopmsz *= 2;
				estop = (struct entry **) erealloc(estop, estopmsz * sizeof(struct entry *));
			}
			/* estop is an array for storing the
			** automatic stopwords */
			estop[estopsz++] = e;
		}
		else
			i++;
	}

	/* If we have automatic stopwords we have to recalculate
	** word positions of all the words left in the index */
	for (i = 0; i < estopsz; i++) {
		e = estop[i];
		for (j = 0; j < ep->currentsize; j++) {
			lp = ep->elist[j]->locationlist;
			lpstop = e->locationlist;
			while (lp) {
				while (lpstop) {
					res = lp->filenum - lpstop->filenum;
					if (res < 0) break;
					if (res == 0) {
						res = lp->metaName - lpstop->metaName;
						if (res < 0) break;
						if (res == 0) {
							/* Same file and metaName: walk both
							** position lists from the end and pull
							** back every position behind the
							** stopword occurrence */
							for (k = lpstop->frequency; k; ) {
								stoppos = lpstop->position[--k];
								for (l = lp->frequency; l; ) {
									if (lp->position[--l] > stoppos)
										lp->position[l]--;
									else
										break;
								}
							}
						}
					}
					lpstop = lpstop->next;
				}
				lp = lp->next;
			}
		}

		/* Free Memory used by stopword */
		efree(e->word);
		lp = e->locationlist;
		while (lp != NULL) {
			tmplp = lp->next;
			if (lp->frequency && lp->position) efree(lp->position);
			efree(lp);
			lp = tmplp;
		}
		/* The entry itself is no longer referenced from the array */
		efree(e);
	}

	efree(estop);
	return stopwords;
}

/* Computes the rank of a word in a file.  This is somewhat similar to
** the rank calculation algorithm from WAIS (I think).
**
** freq       - occurrences of the word in the file (values below 5
**              are treated as 5).
** tfreq      - total frequency of the word.
** words      - number of words in the file; only used when
**              ignoreTotalWordCountWhenRanking is not set.
** emphasized - if non-zero, the rank is multiplied by it (words in
**              titles and headers).
**
** The result is never smaller than 1, and ranks divisible by 128 are
** bumped up by one (to make the compression scheme with '\0' as a line
** delimiter work).
*/

int getrank(freq, tfreq, words, emphasized)
int freq;
int tfreq;
int words;
int emphasized;
{
	double inverse, weight, scaled;
	int rank;

	if (freq < 5)
		freq = 5;

	inverse = 1.0 / (double) tfreq;
	weight = (log((double) freq) + 10.0) * inverse;

	/* Either normalise by the size of the file, or scale down by a
	** fixed amount; a larger divisor makes small differences in word
	** frequency wash out */
	if (ignoreTotalWordCountWhenRanking)
		weight /= 100;
	else
		weight /= words;

	scaled = weight * 10000.0;
	rank = (int) scaled;

	if (rank <= 0)
		rank = 1;
	if (emphasized)
		rank *= emphasized;
	if ((rank % 128) == 0)
		rank++;

	return rank;
}

/* Prints the index information at the head of index files.
**
** fp         - stream to write to.  06/00: when fp is stdout the header
**              is printed to the screen: the INDEXHEADER line and the
**              terminating NUL byte are left out, and the stored
**              'indexedon' string is shown instead of the current date.
** filename   - index file name; only the part after the last '/' is
**              printed.  When there is no '/' or nothing follows it,
**              the whole name is printed.
** totalwords - number of words; printed when non-zero.
** totalFiles - number of files; printed when non-zero.
** merged     - non-zero marks the index as a merged index.
*/

void printheader(fp, filename, totalwords, totalFiles, merged)
FILE *fp;
char *filename;
int totalwords;
int totalFiles;
int merged;
{
	char *c;
	char *savedas;

	/* Strip the directory part.  The old test "c + 1 != '\0'" compared
	** a pointer with 0; what has to be looked at is the character
	** that follows the slash. */
	c = (char *) strrchr(filename, '/');
	if (c == NULL || c[1] == '\0')
		savedas = filename;
	else
		savedas = c + 1;

	if (fp != stdout)
		fprintf(fp, "%s\n", INDEXHEADER);
	fprintf(fp, "%s\n", INDEXVERSION);
	fprintf(fp, "# %s\n", (merged) ? "MERGED INDEX" : "");
	fprintf(fp, "%s %s\n", NAMEHEADER, (indexn[0] == '\0') ? "(no name)" :
		indexn);
	fprintf(fp, "%s %s\n", SAVEDASHEADER, savedas);
	fprintf(fp, "%s ", COUNTSHEADER);
	if (totalwords)
		fprintf(fp, "%d words%s", totalwords, (totalFiles) ? ", " : "");
	if (totalFiles)
		fprintf(fp, "%d files", totalFiles);
	fprintf(fp, "\n");
	if (fp == stdout)
		fprintf(fp, "%s %s\n", INDEXEDONHEADER, indexedon);
	else
		fprintf(fp, "%s %s\n", INDEXEDONHEADER, getthedate());
	fprintf(fp, "%s %s\n", DESCRIPTIONHEADER, (indexd[0] == '\0') ?
		"(no description)" : indexd);
	fprintf(fp, "%s %s\n", POINTERHEADER, (indexp[0] == '\0') ?
		"(no pointer)" : indexp);
	fprintf(fp, "%s %s\n", MAINTAINEDBYHEADER, (indexa[0] == '\0') ?
		"(no maintainer)" : indexa);
	fprintf(fp, "%s %s\n", DOCPROPENHEADER, "Enabled");
	fprintf(fp, "%s %d\n", STEMMINGHEADER, applyStemmingRules);
	fprintf(fp, "%s %d\n", SOUNDEXHEADER, applySoundexRules);
	fprintf(fp, "%s %s\n", WORDCHARSHEADER, wordchars);
	fprintf(fp, "%s %d\n", MINWORDLIMHEADER, minwordlimit);
	fprintf(fp, "%s %d\n", MAXWORDLIMHEADER, maxwordlimit);
	fprintf(fp, "%s %s\n", BEGINCHARSHEADER, beginchars);
	fprintf(fp, "%s %s\n", ENDCHARSHEADER, endchars);
	fprintf(fp, "%s %s\n", IGNOREFIRSTCHARHEADER, ignorefirstchar);
	fprintf(fp, "%s %s\n", IGNORELASTCHARHEADER, ignorelastchar);
	/* Jose Ruiz 06/00 Added this line to delimit the header */
	if (fp != stdout) fputc(0, fp);
}

/* Returns the filename of the filenum-th record (counting from 1) in the
** master file list, or "" when the list has no such record.
** For diagnostics only.
*/
char* getFileNameByFileNum(int filenum)
{
	struct file *node;

	for (node = filelist; node != NULL; node = node->next) {
		/* count down; the record on which the counter hits zero wins */
		if (--filenum == 0)
			break;
	}

	if (node == NULL)
		return "";
	return node->filename;
}

/* Sorts the location list of an entry.  The sort keys are metaName and
** filenum; the ordering itself is defined by icomp2 (presumably
** metaName first, then filenum - confirm against icomp2).
**
** A temporary table with one record per location is built, each record
** holding the two int keys followed by the pointer to the location.
** The table is sorted with qsort() and the list is then relinked in
** table order.  A NULL entry or an empty list is left untouched.
*/
void sortentry(e)
struct entry *e;
{
	int count, recsize, n;
	unsigned char *table, *rec;
	int *keys;
	struct location *loc, *tail, *sorted;

	/* Very trivial case */
	if (!e) return;

	/* Compute number of occurrences */
	count = 0;
	for (loc = e->locationlist; loc; loc = loc->next)
		count++;
	/* Another very trivial case */
	if (!count) return;

	/* Record layout: two int keys, then a pointer-sized slot */
	recsize = 2 * sizeof(int) + sizeof(void *);
	table = (unsigned char *) emalloc(recsize * count);

	/* Build an array with the elements to compare
	** and pointers to data */
	rec = table;
	for (loc = e->locationlist; loc; loc = loc->next) {
		keys = (int *) rec;
		keys[0] = loc->metaName;
		keys[1] = loc->filenum;
		/* the pointer is stored with memcpy, so the slot does not
		** have to be suitably aligned for a pointer */
		memcpy(rec + 2 * sizeof(int), &loc, sizeof loc);
		rec += recsize;
	}

	/* Sort them */
	qsort(table, count, recsize, &icomp2);

	/* Store results: relink the list in sorted order */
	tail = NULL;
	rec = table;
	for (n = 0; n < count; n++) {
		memcpy(&sorted, rec + 2 * sizeof(int), sizeof sorted);
		rec += recsize;
		sorted->next = NULL;
		if (tail)
			tail->next = sorted;
		else
			e->locationlist = sorted;
		tail = sorted;
	}

	/* Free the memory of the array */
	efree(table);
}

/* Print the index entries that hold the word, rank, and other information.
**
** Walks every entry in ep (nothing happens when ep is NULL), skips stop
** words, sorts each remaining entry's location list with sortentry() and
** writes it to fp with printentry().
*/

void printindex(ep, fp)
struct entryarray *ep;
FILE *fp;
{
	int idx;

	if (!ep)
		return;

	idx = 0;
	while (idx < ep->currentsize) {
		if (!isstopword(ep->elist[idx]->word)) {
			/* Locations are grouped by MetaName before writing */
			sortentry(ep->elist[idx]);
			printentry(ep->elist[idx], fp);
		}
		idx++;
	}
}

/* Jose Ruiz 04/00
** Function to write an entry to the index file.
**
** Data written for one word, in order:
**   - compressed word length, the word bytes and a zeroed long placeholder
**   - for each run of locations sharing a metaName: the compressed metaName
**     and a long that is patched afterwards with the file offset at which
**     the run ends; then, per location, filenum, rank, structure, frequency
**     and `frequency` positions, all compressed
**   - a terminating zero byte
**
** Side effects: stores the entry's starting offset in ep->fileoffset, in
** offsets[] (first word seen for each character of indexchars) and in
** hashoffsets[] when the entry is the head of a hashentries[] bucket.
** A location whose rank is still 0 gets its rank computed here.
*/
void printentry(ep,fp)
struct entry *ep;
FILE *fp;
{
	int i,j,wordlen,curmetaname;
	struct location *lp;
	long tmp,curmetanamepos;

	curmetaname=0;
	curmetanamepos=0L;
	ep->fileoffset = ftell(fp);

		/* Remember where the first word for each index char starts */
	for (i = 0; indexchars[i] != '\0'; i++) {
		if ((ep->word)[0] == indexchars[i] && !offsets[i])
			offsets[i] = ep->fileoffset;
	}

		/* Get HashOffset for direct access */
	for (i = 0; i<SEARCHHASHSIZE; i++) {
		if (hashentries[i] == ep) {
			hashoffsets[i] = ep->fileoffset;
			break;
		}
	}

		/* Write word length, word and a NULL offset */
	wordlen=strlen(ep->word);
	compress(wordlen,fp);
	fwrite(ep->word, wordlen, 1, fp);
	printlong(fp,(long)0);

		/* Write location list */
	lp = ep->locationlist;
	while (lp != NULL) {
		int totalWords;
		totalWords = gettotalwords(lp->filenum);
			/* If rank is not computed do it now */
			/* When calling from merge it is computed */
		if(!lp->rank) lp->rank = getrank(lp->frequency, ep->tfrequency,totalWords,lp->emphasized);
		if (verbose == 4)
		{
			printf("%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",getFileNameByFileNum(lp->filenum), ep->word, lp->emphasized, lp->rank, lp->frequency, ep->tfrequency, totalWords, lp->structure, lp->metaName);
		}
		if(curmetaname!=lp->metaName) {
			if(curmetaname) {
				/* Write in previous metaname (curmetaname)
				** file offset to next metaname */
				tmp=ftell(fp);
				fseek(fp,curmetanamepos,SEEK_SET);
				printlong(fp,tmp);
				fseek(fp,tmp,SEEK_SET);
			}
			curmetaname=lp->metaName;
			compress(curmetaname,fp);
			curmetanamepos=ftell(fp);
			printlong(fp,(long)0);
		}
		compress(lp->filenum, fp);
		compress(lp->rank, fp);
		compress(lp->structure, fp);
		/* Writing position information to index file */
		compress(lp->frequency,fp);
		for(j=0;j<lp->frequency;j++) compress(lp->position[j],fp);
		/* End of writing position information to index file */
		lp = lp->next;
	}

	/* Write in previous metaname (curmetaname)
	** file offset to end of metanames.
	** Only done when a metaname block was actually opened: with an
	** empty location list curmetanamepos is still 0 and patching it
	** would overwrite the first bytes of the index file. */
	if (curmetaname) {
		tmp=ftell(fp);
		fseek(fp,curmetanamepos,SEEK_SET);
		printlong(fp,tmp);
		fseek(fp,tmp,SEEK_SET);
	}
		/* A NULL byte to indicate end of word data */
	fputc(0, fp);
}

/* Prints the list of stopwords into the index file.
**
** The current file position is remembered in offsets[STOPWORDPOS]; every
** word chained in hashstoplist[] is then written as a compressed length
** followed by its bytes, and a single zero byte closes the list.
*/

void printstopwords(fp)
FILE *fp;
{
	struct swline *node;
	int bucket, n;

	offsets[STOPWORDPOS] = ftell(fp);

	bucket = 0;
	while (bucket < HASHSIZE) {
		for (node = hashstoplist[bucket]; node != NULL; node = node->next) {
			n=strlen(node->line);
			compress(n,fp);
			fwrite(node->line, n, 1, fp);
		}
		bucket++;
	}

	fputc(0,fp);
}

/*
** Write one file entry to fp: the filename (after ruleparse() has been
** applied to it) and the title, each as a compressed length followed by
** the bytes, then the compressed size, then the document properties via
** storeDocProperties().
*/
void writeFileEntry(filename, title, size, fp, docProperties)
char *filename;
char *title;
int size;     
FILE *fp;
struct docPropertyEntry **docProperties;
{
	char *path;
	int n;

	/* Filename, rewritten by the replace rules */
	path = ruleparse(filename);
	n = strlen(path);
	compress(n,fp);
	fwrite(path,n,1,fp);

	/* Title */
	n = strlen(title);
	compress(n,fp);
	fwrite(title,n,1,fp);

	/* Size is stored as size + 1 to avoid problems with files of
	 * size 0 */
	n = size + 1;
	compress(n,fp);

	storeDocProperties(*docProperties, fp);
}

/*
** Read one file entry from fp (the counterpart of writeFileEntry()).
**
** On return *filename and *title point at static buffers owned by this
** function, so their contents are overwritten by the next call; *size
** receives the stored value minus one.  When docProperties is not NULL it
** is reset to NULL before fetchDocProperties() reads (or skips) the
** properties section.
*/
void readFileEntry(fp, filename, title, size, docProperties)
FILE *fp;
char **filename;
char **title;
int *size;
struct docPropertyEntry **docProperties;
{
	static char *namebuf=NULL;
	static char *titlebuf=NULL;
	static int namecap=0;
	static int titlecap=0;
	int namelen, titlelen, stored;

	/* Buffers are allocated on first use and only ever grow */
	if (namecap == 0) {
		namecap = MAXSTRLEN;
		namebuf = emalloc(namecap + 1);
	}
	if (titlecap == 0) {
		titlecap = MAXSTRLEN;
		titlebuf = emalloc(titlecap + 1);
	}

	/* Filename: length, then bytes */
	uncompress(namelen,fp);
	if (namelen >= namecap) {
		namecap = namelen + 200;
		namebuf = erealloc(namebuf, namecap + 1);
	}
	fread(namebuf, namelen, 1, fp);
	namebuf[namelen] = '\0';

	/* Title: length, then bytes */
	uncompress(titlelen,fp);
	if (titlelen >= titlecap) {
		titlecap = titlelen + 200;
		titlebuf = erealloc(titlebuf, titlecap + 1);
	}
	fread(titlebuf, titlelen, 1, fp);
	titlebuf[titlelen] = '\0';

	/* Size was written as size + 1 */
	uncompress(stored,fp);

	if (docProperties != NULL)
		*docProperties = NULL;

	/* read (or skip over) the document properties section  */
	fetchDocProperties(docProperties, fp);

	*filename = namebuf;
	*title = titlebuf;
	*size = stored - 1;
}

/* Prints the list of files, titles, and sizes into the index file.
**
** The start of the section is stored in offsets[FILELISTPOS].  For every
** node the current file position is registered with addtofilehashlist()
** under a counter that starts at 0, the entry is written with
** writeFileEntry(), and the node's document properties are then freed.
*/

void printfilelist(filep, fp)
struct file *filep;
FILE *fp;
{
	int filenum;

	offsets[FILELISTPOS] = ftell(fp);

	for (filenum = 0; filep != NULL; filep = filep->next, filenum++)
	{
		addtofilehashlist(filenum, ftell(fp));
		writeFileEntry(filep->filename, filep->title, filep->size, fp, &filep->docProperties);
		freeDocProperties(&filep->docProperties);
	}
}

/* Prints the list of metaNames into the file index
**
** The start of the section is stored in offsets[METANAMEPOS].  Each meta
** name is written as a compressed length, the name bytes and a compressed
** style code; a zero byte ends the list.
*/

void printMetaNames(fp)
FILE *fp;
{
	struct metaEntry* cur;
	int namelen, style;

	offsets[METANAMEPOS] = ftell(fp);

	cur = metaEntryList;
	while (cur != NULL)
	{
		namelen = strlen(cur->metaName);
		compress(namelen,fp);
		fwrite(cur->metaName,namelen,1,fp);

		/* Style code, stored plus one so that the written value
		 * is never 0:
		 *   1 -> normal meta name
		 *   2 -> doc property name only
		 *   3 -> both
		 */
		if (!cur->isDocProperty)
			style = 1;
		else if (cur->isOnlyDocProperty)
			style = 2;
		else
			style = 3;
		compress(style,fp);

		cur = cur->next;
	}

	fputc(0,fp);
}

/* Prints the list of file offsets into the index file.
 *
 * The start of the section is stored in offsets[FILEOFFSETPOS].  Values
 * returned by getfilenum() are written with printlong() for increasing
 * indices until getfilenum() returns 0; a zero long closes the list.
 */

void printfileoffsets(fp)
FILE *fp;
{
	int idx;

	offsets[FILEOFFSETPOS] = ftell(fp);

	idx = 0;
	while (getfilenum(idx) != 0) {
		printlong(fp, getfilenum(idx));
		idx++;
	}

	printlong(fp, (long)0);
}

/* Prints out the decompressed values in an index file.
**
** Diagnostic dump of the index open on fp, written to stdout.  Sections are
** printed in the order they are read: any '#' lines that follow the header,
** the offsets table, the hash offsets table, the word entries, the stop
** words, the file entries and finally the metanames.  With verbose == 4
** every word location is labelled (Meta, file name, Rank, Strct, Freq, Pos);
** otherwise the raw numbers are printed.
*/

void decompress(fp)
FILE *fp;
{
	int i, c, x, wordlen, fieldnum, frequency, metaname;
	long pos;
	long num;
	long nextposmetaname;

	metaname=0;
	nextposmetaname=0L;
	
	frequency=0;

	readoffsets(fp);

	if (verbose == 4)
	{
		readfileoffsets(fp);
	}

	fseek(fp, 0, 0);
	readheader(fp);

	fieldnum = 0;
	
	while (1) {
		/* Peek at the next byte to tell a '#' line from the tables */
		c = fgetc(fp);
		ungetc(c, fp);
		if (c == '#') {
			for(c=fgetc(fp);c!=EOF && c!='\n';c=fgetc(fp))putchar(c);
			putchar((int)'\n');
			continue;
		}
		else {
			c=fgetc(fp);  /* Jump null delimiter */
			printf("\n----> OFFSETS INFO. Hexadecimal Numbers <----\n");
			for(i=0;i<MAXCHARS;i++) {
				num=readlong(fp);
				printf("%04lx ", num);
			}
			fgetc(fp);   /*jump '\n' */
			printf("\n----> HASH OFFSETS INFO. Hexadecimal Numbers <----\n");
			for(i=0;i<SEARCHHASHSIZE;i++) {
				num=readlong(fp);
				printf("%04lx ", num);
			}
			fgetc(fp);   /*jump '\n' */
			break;
		}
	}
	printf("\n-----> WORD INFO <-----\n");
			/* Decode word Info */	
	if (ftell(fp) != offsets[STOPWORDPOS]) {
		uncompress(wordlen,fp);
		while (wordlen) 
		{
			for(i=0; i<wordlen; i++) putchar(fgetc(fp));
			putchar((int)':');
				/* Jump offset hash link */
			readlong(fp);
			uncompress(x,fp);  /* metaname */
			if((metaname=x)) {
				nextposmetaname=readlong(fp);
				uncompress(x,fp);  /* First file */
			}
			/* x walks the compressed numbers of the word; fieldnum
			** tracks which field of the current location it is:
			** 0 filenum, 1 rank, 2 structure, 3 frequency, then
			** `frequency` positions before wrapping back to 0 */
			while(x)
			{
				if (verbose == 4)
				{
					switch (fieldnum) {
						case 0:
						{
							char* filename;
							char* junk;
							printf(" Meta:%d", metaname);
							pos = ftell(fp);
							filename = lookupfile(x, fp, NULL);
							/* Cut the string just before the
							** first double quote, if there is
							** one to cut at */
							junk = strchr(filename, '\"');
							if (junk != NULL && junk > filename)
								*(junk-1) = '\0';	
							printf(" %s", filename);
							fseek(fp, pos, 0);
					        }
						break;
						case 1: printf(" Rank:%d", x);break;
						case 2: printf(" Strct:%d", x);break;
						case 3: printf(" Freq:%d", x);break;
						case 4: printf(" Pos:%d", x);break;
						default: printf(",%d",x);break;
				 	}	
				}
				else {
					if(!fieldnum) printf(" %d",metaname);
					printf(" %d", x);
				}
				fieldnum++; 
				if (fieldnum >= 4) {
					if (fieldnum == 4) frequency = x;
					else frequency--;
					if (!frequency) fieldnum = 0;
				}
				/* Reaching the recorded offset means the next
				** number starts a new metaname block (or is the
				** end-of-word zero) */
				if(ftell(fp) == nextposmetaname) {
					uncompress(x,fp);
					if((metaname=x)) {
						nextposmetaname=readlong(fp);
						uncompress(x,fp);
					} else nextposmetaname=0L;
				} else uncompress(x,fp);
			}
			putchar((int)'\n');
			if (ftell(fp) == offsets[STOPWORDPOS]) break;
			uncompress(wordlen,fp);
		}
	} else
		printf("WARNING!! NO unique index words in index file!!\n");

		/* Decode Stop Words: All them are in just one line */
	printf("\n\n-----> STOP WORDS <-----\n");
	uncompress(wordlen,fp);
	while(wordlen) {
		for(i=0;i<wordlen;i++)putchar(fgetc(fp));
		putchar((int)' ');
		uncompress(wordlen,fp);
	}
	putchar((int)'\n');

		/* Decode File Info */
	printf("\n\n-----> FILES <-----\n");
	for (i=1,c=0; c!=EOF; i++)
	{
		if (offsets[FILEOFFSETPOS] == ftell(fp)) {i--;break;}
		uncompress(x,fp); /* length filename */
		for(;x;x--) putchar((int)fgetc(fp));  /* filename */
		putchar(' ');
		putchar('\"');
		uncompress(x,fp); /* length title */
		for(;x;x--) putchar((int)fgetc(fp));  /* title */
		uncompress(x,fp); /* bytes */
		printf("\" %d",x-1);
		/* Print properties */
		uncompress(x,fp); /* prop num */
		while(x) {
			putchar(' ');
			printf("PROP_%d:",(x-1));
			uncompress(x,fp); /* prop length */
			putchar('\"');
			for(;x;x--) putchar((int)fgetc(fp));
			putchar('\"');
			uncompress(x,fp);
		}
		putchar((int)'\n');
	}
	printf("\nNumber of File Entries: %d\n",i);
		/* Jump File Offsets */
	for(c = fgetc(fp); c!=EOF && ftell(fp)!=offsets[METANAMEPOS];c = fgetc(fp));
	if(c!=EOF) {
		/* Meta Names */
		printf("\n\n-----> METANAMES <-----\n");
		uncompress(wordlen,fp);
		while(wordlen) {
			for(i=0;i<wordlen;i++)putchar(fgetc(fp));
			putchar((int)'\"');
			uncompress(i,fp);
			/* Stored style is the real style plus one */
			putchar((int)(i-1+(int)'0'));
			putchar((int)' ');
			uncompress(wordlen,fp);
		}
		putchar((int)'\n');
	}

	if (verbose != 4)
	printf("\nUse -v 4 for a more complete info\n");
}

/* Parses lines according to the ReplaceRules directives.
**
** replacelist holds the rules as consecutive swline nodes: a keyword
** ("replace", "append", "prepend" or "remove", matched with lstrstr())
** followed by its argument(s).  Each rule is applied in turn to a working
** copy of line.
**
** Returns line itself when there are no rules; otherwise a pointer to a
** static buffer that is overwritten by the next call.
*/

char *ruleparse(line)
char *line;
{
	static int lenrule=0;
	static char *rule=NULL;
	static int lentmpline=0;
	static char *tmpline=NULL;
	static int lennewtmpline=0;
	static char *newtmpline=NULL;
	static int lenline1=0;
	static char *line1=NULL;
	static int lenline2=0;
	static char *line2=NULL;
	struct swline *tmplist;
	int ilen1,ilen2;
	int applied;
	
	if(!lenrule) rule=(char *)emalloc((lenrule=MAXSTRLEN)+1);
	if(!lentmpline) tmpline=(char *)emalloc((lentmpline=MAXSTRLEN)+1);
	if(!lennewtmpline)newtmpline=(char *)emalloc((lennewtmpline=MAXSTRLEN)+1);
	if(!lenline1) line1=(char *)emalloc((lenline1=MAXSTRLEN)+1);
	if(!lenline2) line2=(char *)emalloc((lenline2=MAXSTRLEN)+1);

	if (replacelist == NULL)
		return line;
	
	tmplist = replacelist;
	tmpline = SafeStrCopy(tmpline, line,&lentmpline);
	while (1) 
	{
		if (tmplist == NULL)
			return tmpline;
		rule =SafeStrCopy(rule, tmplist->line,&lenrule);
		tmplist = tmplist->next;
		/* A keyword without any argument ends processing */
		if (tmplist == NULL)
			return tmpline;
		if (rule == NULL) {
			replacelist = tmplist;
			return tmpline;
		}

		/* tmplist now points at the first argument of the rule */
		applied = 1;
		if (lstrstr(rule, "replace")) {
			line1 = SafeStrCopy(line1, tmplist->line,&lenline1);
			tmplist = tmplist->next;
			if (tmplist)
			{
				line2 = SafeStrCopy(line2, tmplist->line,&lenline2);
				tmplist = tmplist->next;
			}
			else
			{
				/* Handle case where 2nd part of replace rule
				** is an empty string. Config-file parsing
				** idiosyncrasies cause a replace of "x" to ""
				** to incompletely represent the rule.
				*/
				line2[0] = '\0';
			}
			newtmpline=SafeStrCopy(newtmpline, (char *) matchAndChange(tmpline, line1, line2),&lennewtmpline);
		}
		else if (lstrstr(rule, "append")) {
			ilen1=strlen(tmpline);
			ilen2=strlen(tmplist->line);
			if((ilen1+ilen2)>=lennewtmpline) {
				lennewtmpline=ilen1+ilen2+200;
				newtmpline=erealloc(newtmpline,lennewtmpline+1);
			}
			memcpy(newtmpline,tmpline,ilen1);
			memcpy(newtmpline+ilen1,tmplist->line,ilen2);
			newtmpline[ilen1+ilen2]='\0';
			tmplist = tmplist->next;
		}
		else if (lstrstr(rule, "prepend")) {
			ilen1=strlen(tmpline);
			ilen2=strlen(tmplist->line);
			if((ilen1+ilen2)>=lennewtmpline) {
				lennewtmpline=ilen1+ilen2+200;
				newtmpline=erealloc(newtmpline,lennewtmpline+1);
			}
			memcpy(newtmpline,tmplist->line,ilen2);
			memcpy(newtmpline+ilen2,tmpline,ilen1);
			newtmpline[ilen1+ilen2]='\0';
			tmplist = tmplist->next;
		}
		else if (lstrstr(rule,"remove")) {
			newtmpline = SafeStrCopy(newtmpline, (char *)matchAndChange(tmpline,tmplist->line,""),&lennewtmpline);
			/* Consume the pattern so that it is not read back
			** as the keyword of the next rule */
			tmplist = tmplist->next;
		}
		else {
			/* Unknown keyword: newtmpline holds nothing valid
			** for this line, so leave tmpline alone */
			applied = 0;
		}

		if (applied)
			tmpline=SafeStrCopy(tmpline, newtmpline,&lentmpline);
	}
}


/*
** Get the MetaData index for an XML tag name.
**
** The first blank-delimited word of tag is lower-cased and looked up in
** metaEntryList.
**
** tag          tag text; leading blanks are skipped
** docPropName  optional out parameter: set to 0, then to the entry's index
**              when the matching entry is a document property
**
** Returns the index of the matching entry, or 1 when the entry is a
** property only (and automatic metanames are off) or when the name is
** unknown and OKNOMETA is set.  With applyautomaticmetanames set, an
** unknown name is added with addMetaEntry() and looked up again.  An
** unknown name without OKNOMETA terminates the program.
*/
int getMetaXML(tag, docPropName)
char* tag;
int* docPropName;
{
	char* temp;
	static int lenword=0;
	static char *word=NULL;
	int i;
	struct metaEntry* list;
	
	if(!lenword) word =(char *)emalloc((lenword=MAXWORDLEN)+1);

	if (docPropName != NULL)
	{
		*docPropName = 0;
	}
	
	temp = tag;
	
	/* Get to the beginning of the word disregarding blanks */
	while (temp != NULL && *temp == ' ')
		temp++;
	
	/* Copy the word and convert to lowercase.  The character goes
	** through unsigned char because tolower() is undefined for
	** negative values, which plain char can hold for 8-bit text. */
	for (i=0;temp !=NULL && *temp && *temp != ' '; i++) {
		if (i==lenword) {
			lenword *=2;
			word= (char *) erealloc(word,lenword+1);
		}
		word[i] = (char) tolower((unsigned char) *temp);
		temp++;
	}
	if (i==lenword) {
		lenword *=2;
		word= (char *) erealloc(word,lenword+1);
	}
	word[i] = '\0';
	
	while(1) {
		for (list = metaEntryList; list != NULL; list = list->next)
		{
			if (!strcmp(list->metaName, word) )
			{
				if ((docPropName != NULL) && (list->isDocProperty))
				{
					*docPropName = list->index;
				}
				if (list->isOnlyDocProperty)
				{
					if (applyautomaticmetanames) list->isOnlyDocProperty=0;
					else 
				/* property is not for indexing, so return generic metaName value */
						return 1;
				}
				return list->index;
			}
		}
		/* 06/00 Jose Ruiz
		** If automatic MetaNames enabled add the MetaName
		** else break
		*/
		if(applyautomaticmetanames) {
			if (verbose) 
				printf("\nAdding automatic MetaName %s\n",word);
			addMetaEntry(&metaEntryList,word,0); 
		} else break;
	}
	/* If it is ok not to have the name listed, just index as no-name */
	if (OKNOMETA) {
		/*    printf ("\nwarning: metaName %s does not exist in the user config file", word); */
		return 1;
	}
	else {
		/* NOTE(review): failure path, yet the exit status is 0 */
		printf ("\nerr: INDEXING FAILURE\n");
		printf ("err: The metaName %s does not exist in the user config file\n", word);
		exit(0);
	}
	
}


/* Get the MetaData index when the whole tag is passed */

/* Patch by Tom Brown */
/* TAB, this routine is/was somewhat pathetic... but it was pathetic in
 1.2.4 too ... someone needed a course in defensive programming... there are
 lots of tests below for temp != NULL, but what is desired is *temp != '\0'
 (e.g. simply *temp) ... I'm going to remove some strncmp(temp,constant,1)
 which are much faster as *temp != constant ...

 Anyhow, the test case I've got that's core dumping is:
    <META content="MSHTML 5.00.2614.3401" name=GENERATOR>
 no trailing quote, no trailing space... and with the missing/broken check for
 end of string it scribbles over the stack...

*/

/*
** Look up the meta name carried by a whole META tag.
**
** The text after the first "NAME" (found with lstrstr()) is scanned past
** the '=' sign, blanks and double quotes; the following word, ended by a
** blank, a double quote or the end of the string, is lower-cased and
** looked up in metaEntryList.
**
** tag          the complete tag text
** docPropName  optional out parameter: set to 0, then to the entry's index
**              when the matching entry is a document property
**
** Returns 1 when the tag has no NAME, when the entry is a property only
** (and automatic metanames are off) or when the name is unknown and
** OKNOMETA is set; otherwise the index of the matching entry.  With
** applyautomaticmetanames set, an unknown name is added with addMetaEntry()
** and looked up again.  An unknown name without OKNOMETA terminates the
** program.
*/
int getMeta(tag, docPropName)
char* tag;
int* docPropName;
{
	char* temp;
	static int lenword=0;
	static char *word=NULL;
	int i;
	struct metaEntry* list;
	
	if(!lenword) word =(char *)emalloc((lenword=MAXWORDLEN)+1);

	if (docPropName != NULL)
	{
		*docPropName = 0;
	}
	
	temp = (char*) lstrstr((char*)tag,(char*) "NAME");
	if (temp == NULL)
		return 1;
	
	temp += strlen("NAME");
	
	/* Get to the '=' sign and step over it; stop at end of string */
	while (*temp && *temp != '=')
		temp++;
	if (*temp == '=')
		temp++;
	
	/* Get to the beginning of the word disregarding blanks and quotes */
	while (*temp == ' ' || *temp == '"')
		temp++;
	
	/* Copy the word and convert to lowercase.  The character goes
	** through unsigned char because tolower() is undefined for
	** negative values, which plain char can hold for 8-bit text. */
	for (i=0; *temp && *temp != ' ' && *temp != '"'; i++) {
		if (i==lenword) {
			lenword *=2;
			word= (char *) erealloc(word,lenword+1);
		}
		word[i] = (char) tolower((unsigned char) *temp);
		temp++;
	}
	if (i==lenword) {
		lenword *=2;
		word= (char *) erealloc(word,lenword+1);
	}
	word[i] = '\0';

	while(1) {
		for (list = metaEntryList; list != NULL; list = list->next)
		{
			if (!strcmp(list->metaName, word) )
			{
				if ((docPropName != NULL) && (list->isDocProperty))
				{
					*docPropName = list->index;
				}
				if (list->isOnlyDocProperty)
				{
					if (applyautomaticmetanames) list->isOnlyDocProperty=0;
					else 
					/* property is not for indexing, so return generic metaName value */
						return 1;
				}
				return list->index;
			}
		}
		/* 06/00 Jose Ruiz
		** If automatic MetaNames enabled add the metaName
		** else break
		*/
		if(applyautomaticmetanames) {
			if (verbose) 
				printf("\nAdding automatic MetaName %s\n",word);
			addMetaEntry(&metaEntryList,word,0); 
		} else break;
	}
	/* If it is ok not to have the name listed, just index as no-name */
	if (OKNOMETA) {
		/*    printf ("\nwarning: metaName %s does not exist in the user config file", word); */
		return 1;
	}
	else {
		/* NOTE(review): failure path, yet the exit status is 0 */
		printf ("\nerr: INDEXING FAILURE\n");
		printf ("err: The metaName %s does not exist in the user config file\n", word);
		exit(0);
	}
	
}

/* Parses the Meta tag
**
** Resolves the tag's meta name with getMeta(), then splits the text that
** follows the first double quote after "CONTENT" into words and adds the
** acceptable ones to entrylist under that meta name.  When the meta name is
** a document property, the quoted CONTENT value is also stored with
** addDocProperty().
**
** tag            the complete META tag text
** filenum        file number recorded with each word
** structure      structure value recorded with each word
** thisFileEntry  file entry that receives document properties
**
** Returns the number of words that were terminated while scanning CONTENT
** (whether or not they were indexed); 0 when the tag has no CONTENT or when
** REQMETANAME is set and the meta name resolved to 1.
*/
int parseMetaData(tag, filenum, structure, thisFileEntry)
char* tag;
int filenum;
int structure;
struct file* thisFileEntry;
{
	int metaName, i, j, inword, wordcount, emphasized, jstart;
	char* temp;
	char c;
	static int lenword=0;
	static char *word=NULL;
	int docPropName = 0;
	int position=1; /* position of word */
	
	i=0;
	if(!lenword)word=(char *)emalloc((lenword=MAXWORDLEN)+1);
	wordcount = 0;
	temp = NULL;
	metaName= getMeta(tag, &docPropName);

	/* 10/11/99 - Bill Moseley - don't index meta tags not specified in MetaNames */
	if ( REQMETANAME && metaName == 1 ) return 0;

	temp = (char*) lstrstr((char*) tag,(char*) "CONTENT");
	
	/* if there is no CONTENT it is another tag so just ignore the whole
	* thing; the check is done here instead of before because META tags do
	* not have a fixed length that can be checked
	*/
	if (temp != NULL && *temp)
    {
		temp += strlen("CONTENT");
		
		/* Get to the " sign and step over it; stop at end of string */
		while (*temp && *temp != '"')
			temp++;
		if (*temp == '"')
			temp++;
		
		/* Index in tag of the first character of the content */
		jstart = strlen(tag) - strlen(temp);
		
		if (EMPHASIZECOMMENTS)
			emphasized = 5;
		else
			emphasized = 0;
		
		if (docPropName)
		{
			temp = strchr(tag + jstart, '\"'); /* first quote after start of CONTENT */
			if (temp != NULL)
			{
				*temp = '\0';	/* terminate CONTENT, temporarily */
				addDocProperty(&thisFileEntry->docProperties, docPropName, tag+jstart);
				*temp = '\"';	/* restore string */
			}
		}
		
		for (j = jstart, inword = wordcount = 0;(c = tag[j]) != '\0'; j++) {
			if (!inword) {
				if (iswordchar(c)) {
					i = 0;
					word[i++] = c;
					inword = 1;
				}
			}
			else {
				if (!iswordchar(c)) {
					/* End of a word: normalise it and index it */
					wordcount++;
					/* NOTE(review): the rest of this file grows
					** buffers with erealloc(); the realloc()
					** results here are unchecked */
					if (i == lenword) {
						lenword *=2;
						word = realloc(word,lenword+1);
					}
					word[i] = '\0';
					/* unsigned char: tolower() is undefined for
					** negative values of plain char */
					for (i = 0; word[i]; i++)
						word[i] = (char) tolower((unsigned char) word[i]);
					if (isokword(word))
						{ word = SafeStrCopy(word, (char *) convertentities(word),&lenword); }
					/* Get rid of the last specified char's */
					stripIgnoreLastChars(word);
					
					/* Get rid of the first char */
					stripIgnoreFirstChars(word);
					
					/* Translate chars */
					TranslateChars(word);

					if (applyStemmingRules)
					{
						/* apply stemming algorithm to the word to index */
						Stem(word,lenword);
					}
					if (applySoundexRules)
					{
						/* apply soundex algorithm to the word to index */
						soundex(word);
					}
					if (hasokchars(word))
					{
						if (isokword(word)) {
							entrylist = (struct entryarray *) addentry(entrylist, word, filenum, emphasized, structure, metaName, position);
							position++;
						}
						else
						{
							/* Too-short words are remembered as stop words */
							if ((int)strlen(word) <minwordlimit && !isstopword(word)) 
							{
								addStopList(word);
								addstophash(word);
							}
						}
					}
					inword = 0;
					/* An unescaped closing quote ends CONTENT */
					if (c == '\"' && tag[j-1] != '\\')
						break;
				}
				else {
					if (i == lenword) {
						lenword *=2;
						word = realloc(word,lenword+1);
					}
					word[i++] = c;
				}
			}
		}
    }
	return wordcount;
}

/*  Fixes the problem when a word ends with multiple IGNORELASTCHAR's
**  (eg, qwerty'. ).  The old code correctly deleted the ".", but didn't
**  check if the new last character ("'") is also an ignore character.
**
**  Strips, in place, every trailing character of word for which
**  isIgnoreLastChar() is true.  An empty word, or one made only of such
**  characters, ends up as the empty string.
*/

void stripIgnoreLastChars(char *word)
{
	int i=strlen(word);
	
	/* Iteratively strip off the last character if it's an ignore
	** character.  The i > 0 test keeps the scan from reading the byte
	** in front of the buffer once everything has been stripped. */
	while ( i > 0 && isIgnoreLastChar(word[i - 1]) )
		word[--i] = '\0';
}

/* 06/00 Jose Ruiz ** Obsolete - now is macro with a lookuptable
int isIgnoreLastChar(char c)
{
	int i;
	
	for (i=0; ignorelastchar[i] != '\0'; i++)
	{
		if (c == ignorelastchar[i])
			return 1;
	}
	
	
	return 0;
}
*/

/*
** Removes, in place, every leading character of word for which
** isIgnoreFirstChar() is true by shifting the rest of the string,
** terminator included, to the front.
*/
void stripIgnoreFirstChars(char *word)
{
	int skip;
	
	/* Count the leading characters to drop */
	skip = 0;
	while ( isIgnoreFirstChar(word[skip]) )
		skip++;
	
	/* Nothing to strip: leave the word untouched */
	if (skip == 0)
		return;
	
	/* Regions overlap, hence memmove; +1 carries the NUL along */
	memmove(word, word + skip, strlen(word + skip) + 1);
}


/* 06/00 Jose Ruiz ** Obsolete - now is macro with a lookuptable
int isIgnoreFirstChar(char c)
{
	int i;
	
	for (i=0; ignorefirstchar[i] != '\0'; i++)
		if (c == ignorefirstchar[i])
		return 1;
	
	return 0;
}
*/

/*
** Returns 0 when j is greater than 1 and the character two positions
** before index j in tag is a backslash; returns 1 in every other case.
*/
int notEscaped( char *tag, int j)
{
	/* Too close to the start of the string for the look-behind */
	if (j <= 1)
		return 1;

	return (tag[j - 2] == '\\') ? 0 : 1;
}



/* Jose Ruiz 04/00 */
/* Function to build a hash table with all the words for direct access.
** Every entry of ep that is not a stop word is pushed onto the front of
** the hashentries[] chain selected by searchhash() of its word; chains are
** linked through nexthash.  A NULL ep is ignored.
*/
void computeHash( ep)
struct entryarray *ep;
{
	unsigned slot;
	int idx;

	if (!ep)
		return;

	for (idx = 0; idx < ep->currentsize; idx++) {
		if (isstopword(ep->elist[idx]->word))
			continue;

		slot = searchhash(ep->elist[idx]->word);
		/* Push front: an empty bucket simply yields a NULL link */
		ep->elist[idx]->nexthash = hashentries[slot];
		hashentries[slot] = ep->elist[idx];
	}
}

/* 
** Jose Ruiz 04/00
** Store a portable long with just MAXLONGLEN bytes.
** The bytes are written to fp least significant first, each one being
** num % 256 before num is divided by 256.
*/
void printlong(fp, num)
FILE *fp;
long num;
{
	int k;

	/* A zero value needs no special case: every byte comes out as 0 */
	k = 0;
	while (k < MAXLONGLEN) {
		fputc((num % 256),fp);
		num /= 256;
		k++;
	}
}

/* 
** Jose Ruiz 04/00
** Read a portable long (just MAXLONGLEN bytes).
** The bytes are taken from fp least significant first, which is the order
** printlong() writes them in, and combined base 256.
*/
long readlong(fp)
FILE *fp;
{
	int bytes[MAXLONGLEN];
	long value;
	int k;

	/* Fetch every byte first ... */
	for (k = 0; k < MAXLONGLEN; k++)
		bytes[k] = fgetc(fp);

	/* ... then fold them starting from the most significant one */
	value = 0;
	for (k = MAXLONGLEN - 1; k >= 0; k--) {
		value *= 256;
		value += bytes[k];
	}

	return(value);
}

/* Jose Ruiz 04/00 */
/* Function to print to the index file the hash table with all the words.
** For every entry chained in hashentries[] the file is positioned just
** after the entry's word (fileoffset, compressed length, word bytes) and
** the long found there is overwritten with the file offset of the next
** entry in the same chain, or with 0 for the last one.
*/
void printhash( fp)
FILE *fp;
{
	struct entry *cur, *following;
	int slot, wordlen;

	for (slot = 0; slot < SEARCHHASHSIZE; slot++) {
		cur = hashentries[slot];
		while (cur != NULL) {
			/* Skip over the length prefix and the word itself */
			fseek(fp,cur->fileoffset,0);
			uncompress(wordlen,fp);
			fseek(fp,(long)wordlen,SEEK_CUR);

			following = cur->nexthash;
			if (following != NULL)
				printlong(fp,following->fileoffset);
			else
				printlong(fp,(long)0);
			cur = following;
		}
	}
}

/*
** Replaces, in place, every character of s that occurs in translatechars1
** with the character at the same index in translatechars2.  A NULL s is
** ignored.
*/
void TranslateChars(char *s)
{
	char *hit, *where;

	if (s == NULL)
		return;

	hit = strpbrk(s, translatechars1);
	while (hit != NULL) {
		/* Index of the matched char selects its replacement */
		where = strchr(translatechars1, hit[0]);
		*hit = translatechars2[where - translatechars1];
		/* Resume the search right after the replaced char */
		hit = strpbrk(hit + 1, translatechars1);
	}
}

