From ff5da06da823ac3a0bc7e89b35ed573255139b2d Mon Sep 17 00:00:00 2001
From: Soikk <76824648+Soikk@users.noreply.github.com>
Date: Sun, 24 Jul 2022 22:18:31 +0200
Subject: [PATCH] Added ref count for files & tags. Refactored database.c.
 Removed tags.h but kept documentation.

---
 DOC                |  10 ++-
 TODO               |   2 +
 include/database.h |   9 ++-
 include/storage.h  |  11 ++++
 include/str.h      |   1 -
 include/tags.h     |  22 -------
 src/bm.c           |   2 +-
 src/database.c     | 152 +++++++++++++++++++++++++++++----------------
 src/main.c         |  22 ++++---
 9 files changed, 140 insertions(+), 91 deletions(-)
 delete mode 100644 include/tags.h

diff --git a/DOC b/DOC
index d5ef900..6f1f195 100644
--- a/DOC
+++ b/DOC
@@ -73,18 +73,22 @@ DATABASE
 	A database consists of a 32 character (including trailing 0) name, and:
 		- 2 lookup tables (ltable) for storing the unique file and tag names.
 		- 2 hash tables (htable) for storing the hashes of the names in the ltables.
+		- 2 hash tables (htable) for storing the count of the files & tags in the mapping table (how many files one tags has and vice versa).
 		- 1 mapping tables (mtable) for storing the mappings of the tags to the files.
 	The lookup tables serve the purpose of looking up the names of the files and tags when needed.
-	The hash tables serve the purpose of providing faster search times when searching for a file or tag.
-	Each respective lookup and hash tables (lfiles and hfiles, ltags and htags) share indexes.
+	The first 2 hash tables serve the purpose of providing faster search times when searching for a file or tag.
+	The remaining 2 hash tables (fcount and tcount) serve the purpose of storing the count of how many of each file and tag is in the mapping table.
+	Each respective lookup and hash tables (lfiles, hfiles and fcount, ltags, htags and hcount) share indexes.
 	The mapping table serves the purpose of storing the relation between different files and tags as the pairing of their indexes.
-	A databbase is written to disk in the following format:
+	A database is written to disk in the following format:
 		- 2 bytes as a header that store the 'DB' ASCII characters.
 		- 32 bytes that store the name of the database.
 		- The lfiles ltable.
 		- The ltags ltable.
 		- The hfiles htable.
 		- The htags htable.
+		- The fcount htable;
+		- The tcount htable;
 		- The map mtable.
 		- 3 bytes as "end" that store the 'END'"' ASCII characters.
 	When loading a database, if the header doesn't match, it will print "Header is '(header)' not 'DB'" to standard error.
diff --git a/TODO b/TODO
index 309e76d..e934776 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,5 @@
+TODO	Get rid of old functionalities (strnatcmp, BM)
+
 ----------------------------------------------------------------
 DONE	Change DB model from struct row
 			typedef struct{
diff --git a/include/database.h b/include/database.h
index 2b1c91a..339b48b 100644
--- a/include/database.h
+++ b/include/database.h
@@ -8,6 +8,7 @@ typedef struct database{
 	char name[32];
 	ltable *lfiles, *ltags;
 	htable *hfiles, *htags;
+	htable *fcount, *tcount;
 	mtable *map;
 } database;
 
@@ -18,11 +19,17 @@ database *loadDatabase(const char* path);
 
 int storeDatabase(database *db, const char *path);
 
+uint64_t addFile(database *db, char *file);
+
+uint64_t addTag(database *db, char *tag);
+
 int addFileTag(database *db, char *file, char *tag);
 
 int addFileTags(database *db, char *file, int ntags, ...);
 
-int searchTag(database *db, char *tag, uint64_t *rl);
+int searchTag(database *db, char *tag, uint64_t n, uint64_t **r, uint64_t *rl);
+
+int searchFile(database *db, char *file, uint64_t n, uint64_t **r, uint64_t *rl);
 
 void printDatabase(database *db);
 
diff --git a/include/storage.h b/include/storage.h
index 82ea55a..bef740a 100644
--- a/include/storage.h
+++ b/include/storage.h
@@ -3,6 +3,17 @@
 
 #include "db.h"
 
+/*	(From tags.h)
+	tags are stored in a big table (or their hashes are) ordered
+	by alphabetical order
+	tags can have namespaces which are a special tag that starts
+	with a ':'
+	all tags in a namespace are located between two occurrences
+	of the namespace within the list,
+		e.g [":people", "sam hyde", "hitler", ":people"]
+	maybe namespaces use another hashing function to prevent
+	collisions because of the lack of space because of the ':'
+*/
 
 /*
 	tags are stored in a big table (or their hashes are) ordered
diff --git a/include/str.h b/include/str.h
index de119f8..999f1e3 100644
--- a/include/str.h
+++ b/include/str.h
@@ -14,5 +14,4 @@ char *normalizeStrLimit(const char *str, uint32_t *l, uint32_t limit);
 
 ssize_t strInTags(const char *tags, int n, const char *ndl, int m, char sep); 
 
-
 #endif
diff --git a/include/tags.h b/include/tags.h
deleted file mode 100644
index 0b7f5a9..0000000
--- a/include/tags.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef TAGS_H
-#define TAGS_H
-
-/*
-	tags are stored in a big table (or their hashes are) ordered
-	by alphabetical order
-	tags can have namespaces which are a special tag that starts
-	with a ':'
-	all tags in a namespace are located between two occurrences
-	of the namespace within the list,
-		e.g [":people", "sam hyde", "hitler", ":people"]
-	maybe namespaces use another hashing function to prevent
-	collisions because of the lack of space because of the ':'
-*/
-
-#define MAXTAGS 4094
-
-void insertTag(char **tags, char *tag){
-	if()
-}
-
-#endif
\ No newline at end of file
diff --git a/src/bm.c b/src/bm.c
index d94765e..d2afb0a 100644
--- a/src/bm.c
+++ b/src/bm.c
@@ -41,4 +41,4 @@ ssize_t BM(char *x, int m, char *y, int n){
 		j += shift;                          /* shift */
 	}
 	return -1;
-}
\ No newline at end of file
+}
diff --git a/src/database.c b/src/database.c
index db7759e..f034464 100644
--- a/src/database.c
+++ b/src/database.c
@@ -8,6 +8,8 @@ database *newDatabase(char *name){
 	db->ltags = newLtable(0);
 	db->hfiles = newHtable(0);
 	db->htags = newHtable(0);
+	db->fcount = newHtable(0);
+	db->tcount = newHtable(0);
 	db->map = newMtable(0);
 	return db;
 }
@@ -19,6 +21,7 @@ database *loadDatabase(const char* path){
 	if(!sameStr(header, "DB")){
 		fprintf(stderr, "Header is '%s' not 'DB'\n", header);
 	}
+	
 	char name[32];
 	fread(&name, sizeof(char), 32, fp);
 	database *db = newDatabase(name);
@@ -26,9 +29,12 @@ database *loadDatabase(const char* path){
 	db->ltags = loadLtable(fp);
 	db->hfiles = loadHtable(fp);
 	db->htags = loadHtable(fp);
+	db->fcount = loadHtable(fp);
+	db->tcount = loadHtable(fp);
 	db->map = loadMtable(fp);
-	char end[4];
-	fread(&end, sizeof(char), 3, fp);
+	
+	char *end = calloc(3, sizeof(char));
+	fread(end, sizeof(char), 3, fp);
 	if(!sameStr(end, "END")){
 		fprintf(stderr, "End is '%s' not 'END'\n", end);
 	}
@@ -47,6 +53,8 @@ int storeDatabase(database *db, const char *path){
 	storeLtable(db->ltags, fp);
 	storeHtable(db->hfiles, fp);
 	storeHtable(db->htags, fp);
+	storeHtable(db->fcount, fp);
+	storeHtable(db->tcount, fp);
 	storeMtable(db->map, fp);
 	
 	char end[3] = "END";
@@ -56,6 +64,44 @@ int storeDatabase(database *db, const char *path){
 	return 0;
 }
 
+static void increaseCount(htable *ht, uint64_t i){
+	ht->table[i]++;
+}
+
+uint64_t addFile(database *db, char *file){
+	uint32_t l;
+	file = normalizeStrLimit(file, &l, MAXPATH-1);
+	uint64_t h = crc64(0, file, l);
+	uint64_t i = htableSearch(db->hfiles, h);
+	
+	if(i == -1){
+		ltableAdd(db->lfiles, file);
+		htableAdd(db->hfiles, h);
+		htableAdd(db->fcount, 0);
+		i = db->hfiles->size-1;
+	}
+	increaseCount(db->fcount, i);
+	
+	return i;
+}
+
+uint64_t addTag(database *db, char *tag){
+	uint32_t l;
+	tag = normalizeStrLimit(tag, &l, MAXPATH-1);
+	uint64_t h = crc64(0, tag, l);
+	uint64_t i = htableSearch(db->htags, h);
+	
+	if(i == -1){
+		ltableAdd(db->ltags, tag);
+		htableAdd(db->htags, h);
+		htableAdd(db->tcount, 0);
+		i = db->htags->size-1;
+	}
+	increaseCount(db->tcount, i);
+	
+	return i;
+}
+
 static int addRelation(database *db, relation r){
 	if(mtableSearch(db->map, r) != -1){
 		return -1;
@@ -66,83 +112,81 @@ static int addRelation(database *db, relation r){
 }
 
 int addFileTag(database *db, char *file, char *tag){
-	uint32_t lf, lt;
-	file = normalizeStrLimit(file, &lf, MAXPATH-1);
-	tag = normalizeStrLimit(tag, &lt, MAXPATH-1);
-	uint64_t hf = crc64(0, file, lf), ht = crc64(0, tag, lt);
-	uint64_t fi = htableSearch(db->hfiles, hf), ti = htableSearch(db->htags, ht);
-	
-	if(fi == -1){
-		ltableAdd(db->lfiles, file);
-		htableAdd(db->hfiles, hf);
-		fi = db->hfiles->size-1;
-	}
-	if(ti == -1){
-		ltableAdd(db->ltags, tag);
-		htableAdd(db->htags, ht);
-		ti = db->htags->size-1;
-	}
-	
+	uint64_t fi = addFile(db, file), ti = addTag(db, tag);
 	addRelation(db, (relation){.file = fi, .tag = ti});
+	
 	return 0;
 }
 
 int addFileTags(database *db, char *file, int ntags, ...){
-	uint32_t lf;
-	file = normalizeStrLimit(file, &lf, MAXPATH-1);
-	uint64_t hf = crc64(0, file, lf);
-	uint64_t fi = htableSearch(db->hfiles, hf);
-	
-	if(fi == -1){
-		ltableAdd(db->lfiles, file);
-		htableAdd(db->hfiles, hf);
-		fi = db->hfiles->size-1;
-	}
-	
 	va_list tags;
 	va_start(tags, ntags);
 	for(uint64_t i = 0; i < ntags; ++i){
 		char *tag = va_arg(tags, char*);
-		uint32_t lt;
-		tag = normalizeStrLimit(tag, &lt, MAXPATH-1);
-		uint64_t ht = crc64(0, tag, lt);
-		uint64_t ti = htableSearch(db->htags, ht);
-		
-		if(ti == -1){
-			ltableAdd(db->ltags, tag);
-			htableAdd(db->htags, ht);
-			ti = db->htags->size-1;
-		}
-		
-		addRelation(db, (relation){.file = fi, .tag = ti});
+		addFileTag(db, file, tag);
 	}
 	va_end(tags);
 
 	return 0;
 }
 
-// Should return a list with the indexes of the files that have this tag
-int searchTag(database *db, char *tag, uint64_t *rl){
+// Stores in r a list with the indexes of the first n files that have this tag
+// If n is 0 or lower, it returns all of them. Stores in rl the length of r
+int searchTag(database *db, char *tag, uint64_t n, uint64_t **r, uint64_t *rl){
 	uint32_t l;
 	tag = normalizeStrLimit(tag, &l, MAXPATH-1);
 	uint64_t h = crc64(0, tag, l);
 	uint64_t ti = htableSearch(db->htags, h);
-	// TODO: error checking
+	if(ti == -1){
+		return -1;
+	}
 	
-	uint64_t c = 0;
+	*rl = 0;
 	for(uint64_t i = 0; i < db->map->size; ++i){
+		if(n < 1 || *rl < n){
+			if(db->map->table[i].tag == ti){
+				++(*rl);
+			}
+		}
+	}
+	*r = malloc((*rl)*sizeof(uint64_t));
+	uint64_t c = 0;
+	for(uint64_t i = 0; i < db->map->size && c < *rl; ++i){
 		if(db->map->table[i].tag == ti){
-			++c;
+			(*r)[c++] = db->map->table[i].file;
 		}
 	}
-	uint64_t *r = malloc(c*sizeof(uint64_t));
-	c = 0;
+	
+	return 0;
+}
+
+// Stores in r a list with the indexes of the first n tags that this file has
+// If n is 0 or lower, it returns all of them. Stores in rl the length of r
+int searchFile(database *db, char *file, uint64_t n, uint64_t **r, uint64_t *rl){
+	uint32_t l;
+	file = normalizeStrLimit(file, &l, MAXPATH-1);
+	uint64_t h = crc64(0, file, l);
+	uint64_t fi = htableSearch(db->hfiles, h);
+	if(fi == -1){
+		return -1;
+	}
+	
+	*rl = 0;
 	for(uint64_t i = 0; i < db->map->size; ++i){
-		if(db->map->table[i].tag == ti){
-			r[c++] = db->map->table[i].file;
+		if(n < 1 || *rl < n){
+			if(db->map->table[i].file == fi){
+				++(*rl);
+			}
 		}
 	}
-	rl = r;
+	*r = malloc((*rl)*sizeof(uint64_t));
+	uint64_t c = 0;
+	for(uint64_t i = 0; i < db->map->size && c < *rl; ++i){
+		if(db->map->table[i].file == fi){
+			(*r)[c++] = db->map->table[i].tag;
+		}
+	}
+	
 	return 0;
 }
 
@@ -158,11 +202,11 @@ void debugDatabase(database *db){
 	printf("Name: %s\n", db->name);
 	printf("\t-lfiles: %d\n", db->lfiles->size);
 	for(uint64_t i = 0; i < db->lfiles->size; ++i){
-		printf("\t\t+%s\n", db->lfiles->table[i]);
+		printf("\t\t+%s (%" PRIu64 ")\n", db->lfiles->table[i], db->fcount->table[i]);
 	}
 	printf("\t-ltags: %d\n", db->ltags->size);
 	for(uint64_t i = 0; i < db->ltags->size; ++i){
-		printf("\t\t+%s\n", db->ltags->table[i]);
+		printf("\t\t+%s (%" PRIu64 ")\n", db->ltags->table[i], db->tcount->table[i]);
 	}
 	printf("\t-hfiles: %d\n", db->hfiles->size);
 	for(uint64_t i = 0; i < db->hfiles->size; ++i){
diff --git a/src/main.c b/src/main.c
index 01fdad2..bd5c6b9 100644
--- a/src/main.c
+++ b/src/main.c
@@ -4,26 +4,30 @@
 
 int main(){
 
-	printf("%016llu\n", (uint64_t) crc64(0, (unsigned char*)"cacadevaca", 10));
-
+	database *db = newDatabase("miDB");
 	inputBuffer *in = newInputBuffer();
 	
-	char *str = "grandmother;football;capital;concerned;entire;realize;garden;refused;proud;tune;rhyme;other;writer;command;fresh;fence;rapidly;active;cover;repeat;determine;yard;cannot;animal;pure;rich;mirror;frozen;vast;coach;brass;activity;bottom;airplane;local;tone;attack;though;between;value;collect;mission;tower;brought;original;history;reason;minute;would;hung;strange;children;offer;blue;wrapped;magnet;color;cage;easily;percent;lower;verb;hundred;larger;away;was;certain;western;yes;lack;wish;same;spend;arrive;fog;heard;bill;effort;steam;wolf;indicate;suppose;because;life;down;seat;age;earn;under;cell;floating;although;spent;folks;swing;hello;cent;swung;pen;happened;slip;pupil;smell;fix;piano;closer;idea;trunk;model;school;particularly;he;coast;describe;such;join;been;hard;three;around;tube;soldier;baby;mouse;note;sort;house;gasoline;organized;eat;sat;crowd;alive;spoken;wide;square;luck;tales;angry;having;wear;frog;outer;nice;regular;year;clothing;check;throughout;farmer;dug;dark;exercise;table;your;form;should;personal;use;road;bright;walk;fairly;affect;but;night;close;job;front;fight;beside;ocean;herd;pass;hardly;widely;prepare;nails;paid;lucky;design;grandfather;aid;heavy;truck;sleep;difficult;log;keep;government;headed;mother;sad;bread;voyage;when;happy;making;whistle;plural;guard;therefore;continent;roof;money;pan;unusual;region;special;generally;plate;visit;look;lost;sick;wonderful;farther;put;characteristic;gravity;trap;system;twice;taste;knew;mad;smallest;automobile;return;huge;underline;danger;news;electric;information;breeze;thread;equally;five;new;average;former;wild;spend;cabin;recognize;nearest;circle;such;found;pass;whistle;slave;event;knowledge;fear;friend;am;browserling;cry;length;thy;create;busy;office;earth;blind;smallest;birthday;putting;classroom;pen;southern;summer;put;open;solution;spread;equator;else;kitchen;determine;strong;change;world;pocket;claws;earn;excellent;drove;donkey;rush;band;energy;fighting;hurt;ordinary;native;visitor;give;storm;pressure;imagine;street;engine;worth;hospital;attached;subject;perhaps;hospital;living;waste;dark;natural;change;enter;girl;motor;element;experiment;physical;value;excited;fort;layers;buy;minerals;satisfied;next;spirit;unhappy;storm;angry;science;desk;develop;behind;afraid;act;else;prepare;given;raw;affect;husband;ring;older;brought;book;cow;lake;sides;ago;fill;successful;real;aside;taught;mind;straight;date;very;chart;slabs;thin;saddle;full;sort;heard;surprise;fox;cool;dish;alphabet;early;spring;nest;sometime;date;light;break;lion;difference;rhyme;might;step;teach;potatoes;young;nine;liquid;how;lunch;heavy;mass;being;save;cutting;negative;swimming;cutting;journey;army;none;worry;leave;explore;baseball;fight;road;exact;hay;voyage;sheet;test;right;examine;agree;heart;pig;cannot;tool;hill;changing;bee;find;together;lay;tie;lost;continued;then;came;rhyme;mirror;town;substance;both;up;quite;push;shake;solid;result;you;ought;chicken;waste;freedom;why;somehow;not;complete;sick;struggle;military;pure;top;south;step;education;could;between;familiar;recognize;rich;tool;material;were;chicken;stopped;stay;policeman;round;firm";
-
-
-	database *db = newDatabase("miDB");
 	
 	addFileTag(db, "vaca.png", "naturaleza");	
 	addFileTags(db, "donald-tromp.jpg", 3, "based", "hitler", "very cool");
+	addFileTag(db, "vaca.png", "lovely");
+	addFileTags(db, "vaca.png", 3, "nature", "kami", "very cool");
 	
 	storeDatabase(db, "db.db");
-	
-	db = loadDatabase("db.db");
 
 	printDatabase(db);
 	
 	debugDatabase(db);
-
+	
+	uint64_t *l, i;
+	searchFile(db, "donald-tromp.jpg", 0, &l, &i);
+	
+	printf("Tags with tag 'donald-tromp.jpg':\n");
+	for(uint64_t j = 0; j < i; ++j){
+		printf("\t%s\n", db->ltags->table[l[j]]);
+	
+	}
+	
 
 	while(0){
 
-- 
2.47.3