#include "udm_config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include "udm_common.h"
#include "udm_spell.h"
#include "udm_db.h"
#include "udm_hrefs.h"
#include "udm_utils.h"
#include "udm_xmalloc.h"
#include "udm_charset.h"

/* TUNE */
/* Max URLs in cache: 4K URLs will use about 200K of RAM         */
/* This should be a configurable parameter but we'll use 4K now  */

/* Cache size above which UdmStoreHrefs() drops the whole list.       */
/* Parenthesized so the expansion stays one value in any expression.  */
#define MAXHSIZE	(1023*4)
#define HSIZE		256	/* Length of buffer increment  */
#define RESORT_HREFS	256	/* Max length of unsorted part */


/* Function to sort URLs in alphabetic order */
static int cmphrefs(const void * v1, const void * v2){
	return(strcmp(((const UDM_HREF*)v1)->href,((const UDM_HREF*)v2)->href));
}


/*
 * Add a link to the in-memory href cache kept in Conf.
 *
 * The link is copied and normalized (UdmTrim, UdmStrRemoveChars,
 * UdmUnescapeSgmlStr) before it is looked up.  The cache consists of
 * a sorted part [0, shrefs) searched by bisection and an unsorted
 * tail [shrefs, nhrefs) searched linearly.
 *
 * Conf      environment that owns the cache
 * href      link to add; NULL, empty, whitespace-only or longer than
 *           UDM_URLSIZE-1 links are ignored
 * referrer, hops, stored
 *           copied into the new entry; if the link is already cached
 *           only `stored` is OR-ed into the existing entry
 * tag, cat  optional strings (may be NULL); duplicated when given
 *
 * Returns 1 when a new entry was appended, 0 otherwise (link rejected,
 * already cached, or the string copies could not be allocated).
 */
__INDLIB__ int UdmAddHref(UDM_ENV * Conf,char *href,int referrer,int hops,int stored, char * tag,char * cat){
	int l,r,c,res;
	size_t i,len;
	char ehref[UDM_URLSIZE];
	char *newhref,*newtag,*newcat;

	/* Don't add missing, empty or too long link */
	if(!href)return(0);
	len=strlen(href);
	if((len<1)||(len>UDM_URLSIZE-1))return(0);

	/* Length was checked above, so the copy (with its NUL) fits */
	memcpy(ehref,href,len+1);
	UdmTrim(ehref," \t\r\n");
	UdmStrRemoveChars(ehref,"\t\r\n");
	UdmUnescapeSgmlStr(ehref);

	/* Normalization may leave nothing, e.g. a whitespace-only link */
	if(!ehref[0])return(0);

	/* Find current URL in sorted part of list */
	l=0;r=Conf->shrefs-1;
	while(l<=r){
		c=l+(r-l)/2;
		if(!(res=strcmp(Conf->Href[c].href,ehref))){
			Conf->Href[c].stored|=stored;
			return(0);
		}
		if(res<0)
			l=c+1;
		else
			r=c-1;
	}
	/* Find in unsorted part */
	for(i=Conf->shrefs;i<Conf->nhrefs;i++){
		if(!strcmp(Conf->Href[i].href,ehref)){
			Conf->Href[i].stored|=stored;
			return(0);
		}
	}

	/* Grow the array by HSIZE entries when it is full */
	if(Conf->nhrefs>=Conf->mhrefs){
		if(Conf->mhrefs){
			Conf->mhrefs+=HSIZE;
			Conf->Href=(UDM_HREF *)UdmXrealloc(Conf->Href,Conf->mhrefs*sizeof(UDM_HREF));
		}else{
			Conf->mhrefs=HSIZE;
			Conf->Href=(UDM_HREF *)UdmXmalloc(Conf->mhrefs*sizeof(UDM_HREF));
		}
	}

	/* Copy the strings first: an entry with a NULL href would crash */
	/* strcmp() in later lookups and in the sort comparator          */
	newhref=strdup(ehref);
	newtag=tag?strdup(tag):NULL;
	newcat=cat?strdup(cat):NULL;
	if(!newhref||(tag&&!newtag)||(cat&&!newcat)){
		free(newhref);
		free(newtag);
		free(newcat);
		return(0);
	}

	Conf->Href[Conf->nhrefs].href=newhref;
	Conf->Href[Conf->nhrefs].referrer=referrer;
	Conf->Href[Conf->nhrefs].hops=hops;
	Conf->Href[Conf->nhrefs].stored=stored;
	Conf->Href[Conf->nhrefs].tag=newtag;
	Conf->Href[Conf->nhrefs].cat=newcat;
	Conf->nhrefs++;

	/* Sort unsorted part */
	if((Conf->nhrefs-Conf->shrefs)>RESORT_HREFS){
		qsort(Conf->Href,Conf->nhrefs,sizeof(UDM_HREF),cmphrefs);
		/* Remember count of sorted URLs  */
		Conf->shrefs=Conf->nhrefs;
		/* Sorting moved the entries, so UdmStoreHrefs() has to */
		/* rescan from the start; `stored` flags are preserved  */
		Conf->dhrefs=0;
	}
	return(1);
}
/*
 * Release the whole href cache of Conf: the href, tag and cat strings
 * of every entry, then the entry array itself, and reset the
 * nhrefs/mhrefs/shrefs/dhrefs counters to 0.
 */
extern __INDLIB__ void UdmFreeHrefs(UDM_ENV * Conf){
	size_t n=0;

	while(n<Conf->nhrefs){
		UDM_HREF *item=&Conf->Href[n];

		free(item->href);
		UDM_FREE(item->tag);
		UDM_FREE(item->cat);
		n++;
	}

	/* Drop the array, then clear all bookkeeping */
	UDM_FREE(Conf->Href);
	Conf->dhrefs=0;
	Conf->shrefs=0;
	Conf->mhrefs=0;
	Conf->nhrefs=0;
}
/*
 * Pass every cached href that is not yet marked as stored to
 * UdmAddURL(), scanning from index dhrefs up to nhrefs.
 *
 * For hrefs containing '@' and a '/', the text after the last '/' is
 * passed as the message id; otherwise an empty string is used.
 *
 * Returns the number of entries newly marked as stored.  The scan
 * stops early, without advancing dhrefs, when UdmDBErrorCode() reports
 * an error or when the message id cannot be allocated; entries not
 * yet processed keep stored==0 and are retried on the next call.
 */
extern __INDLIB__ int UdmStoreHrefs(UDM_AGENT * Indexer){
	size_t i;
	int added=0;

	for(i=Indexer->Conf->dhrefs;i<Indexer->Conf->nhrefs;i++){
		char * msg_id=NULL;
		char * pos;

		if(Indexer->Conf->Href[i].stored)continue;

		/* see if it is a NEWS message */
		if((strchr(Indexer->Conf->Href[i].href,'@'))&&(pos = strrchr(Indexer->Conf->Href[i].href,'/'))){
			msg_id = strdup(pos+1);
		}else{
			msg_id = strdup("");
		}

		/* Out of memory: do not hand a NULL message id to UdmAddURL() */
		if(!msg_id)return(added);

		/* now add the url with its message id */
		if(strlen(Indexer->Conf->Href[i].href)<=UDM_URLSIZE){
			UdmAddURL(Indexer,Indexer->Conf->Href[i].href,Indexer->Conf->Href[i].referrer,Indexer->Conf->Href[i].hops,msg_id,Indexer->Conf->Href[i].tag,Indexer->Conf->Href[i].cat);
			if(UdmDBErrorCode(Indexer->db)){
				UDM_FREE(msg_id);
				return(added);
			}
		}

		/* free msg_id, it was only needed for the call above */
		UDM_FREE(msg_id);
		Indexer->Conf->Href[i].stored=1;
		added++;
	}
	/* Remember last stored URL num   */
	/* Note that it will become 0     */
	/* after next sort in UdmAddHref  */
	Indexer->Conf->dhrefs=Indexer->Conf->nhrefs;

	/* We should not free URL list with own database */
	/* to avoid double indexing of the same document */
	/* So, do it if compiled with SQL only           */

#ifndef HAVE_FILES
	if(Indexer->Conf->nhrefs>MAXHSIZE)
		UdmFreeHrefs(Indexer->Conf);
#endif
	return(added);
}
