# 동적 완벽 해싱

Wed, Feb 3 2021 02:07:25

2021년 3월 20일에 업데이트하였습니다.

해싱 알고리즘에는 여러 알고리즘이 있는데 동적 완벽 해싱(dynamic perfect hashing)에 perfect 라는 단어 때문에 궁극의 해싱인가 싶어서 구현해봤습니다.

https://en.wikipedia.org/wiki/Dynamic_perfect_hashing
논문 제목: Dynamic Perfect Hashing: Upper and Lower Bounds

동적 완벽 해싱이 이론 상 좋아보일 수는 있습니다. 하지만, 동적 완벽 해싱은 데이터를 삽입할 때 충돌이 발생하면 랜덤으로 해시 함수를 준비하여 다시 해싱을 하는 과정을 거칩니다. 이 과정에서 시간이 지연됩니다.

논문에서 h(x) 는 다음과 같습니다. k 의 범위는 1 ≤ kp - 1 입니다.
p 는 prime, s 는 공간 size 입니다.

key 를 주면 그걸 유니버설 해시 함수로 해시값을 만듭니다. `mod` 연산에서 시간 잡아먹고요.

``````h(x) = ((k * x) mod p) mod s
``````

그리고…
알고리즘을 보면 insert 할 때 느린 건 이해는 하겠는데 lookup 할 때도 h(x) 를 두번 거치고, 수많은 버킷 배열과 엔트리 배열 때문에 시간이 지연됩니다. 그래서 해싱과는 별개로 배열 수만 늘려보고 테스트를 해봤는데, 단순히 배열 수만 늘어나도 `array[i]` 의 참조 연산 속도가 떨어지더군요. 이론과 실제는 다르다는 것을 또 다시 느낍니다.

``````i = h(x, random, PRIME, buckets_len)
entries = buckets[i]
j = h(x, random, PRIME, entries_len)
entry = entries[j]
``````

그러면 이 알고리즘은 쓸모가 없는가? 그렇지 않습니다. 기존의 해시 테이블에서 충돌 해결 기법으로 동적 완벽 해싱을 사용하면 매우 좋을 것 같습니다.

다음 소스코드는 제가 논문 뒤져보면서 작성한 코드입니다.
논문에 몇몇 오류가 있는데 가급적 논문에 나온 그대로 작성하였습니다. 논문과 일부 다른 부분도 있습니다.
컴파일 방법은 다음과 같습니다. 최적화 옵션 `-O1` 을 넣지 않으면 너무 느리니 최적화 옵션을 넣어서 컴파일해보시기 바랍니다.

``````cc -O1 -Wall -o tian-dynfect tian-dynfect.c
``````
``````/*
* tian-dynfect.c
* Unauthorized copying of this software, via any medium is strictly prohibited.
* Proprietary and confidential.
* Written by Hodong Kim <hodong@nimfsoft.com> January,Febrary 2021
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>

typedef void (* TianFreeFunc)    (void *data);
typedef bool (* TianEqualFunc)   (const void *a, const void *b);

typedef struct _TianArray TianArray;
struct _TianArray {
void         **data;
unsigned int   len;
};

typedef struct _TianArrayPrivate TianArrayPrivate;
struct _TianArrayPrivate {
void         **data;
unsigned int   len;
TianFreeFunc   free_func;
unsigned int   capacity;
};

TianArray *tian_array_new (TianFreeFunc free_func)
{
TianArrayPrivate *array;

array = malloc (sizeof (TianArrayPrivate));
array->capacity  = 1;
array->data      = malloc (array->capacity * sizeof (void *));
array->len       = 0;
array->free_func = free_func;

return (TianArray *) array;
}

void **tian_array_free (TianArray *array, bool free_data)
{
TianArrayPrivate *priv = (TianArrayPrivate *) array;

if (free_data)
{
if (priv->free_func)
for (int i = 0; i < array->len; i++)
if (array->data[i])
priv->free_func (array->data[i]);

free (array->data);
free (array);
return NULL;
}
else
{
void **data = array->data;
free (array);
return data;
}
}

void tian_array_clear (TianArray *array)
{
TianArrayPrivate *priv = (TianArrayPrivate *) array;

if (priv->free_func)
for (int i = 0; i < array->len; i++)
if (array->data[i])
priv->free_func (array->data[i]);

priv->capacity = 1;
array->data    = realloc (priv->data, priv->capacity * sizeof (void *));
array->len     = 0;
}

void tian_array_add (TianArray *array, void *data)
{
TianArrayPrivate *priv = (TianArrayPrivate *) array;

if (array->len == priv->capacity)
{
priv->capacity = priv->capacity * 2;
array->data = realloc (array->data, sizeof (void *) * priv->capacity);
}

array->data[array->len] = data;
array->len++;
}

/*****************************************************************************/

typedef uint64_t (* TianHashFunc)  (const void *key);

typedef struct _TianEntry TianEntry;
struct _TianEntry {
void *key;
void *value;
};

TianEntry *tian_entry_new (void *key, void *value)
{
TianEntry *entry = malloc (sizeof (TianEntry));
entry->key   = key;
entry->value = value;
return entry;
}

void tian_entry_free (TianEntry *entry,
TianFreeFunc key_free_func,
TianFreeFunc value_free_func)
{
if (key_free_func)
key_free_func (entry->key);

if (value_free_func)
value_free_func (entry->value);

free (entry);
}

/*****************************************************************************/

bool tian_direct_equal (const void *a, const void *b)
{
return a == b;
}

bool tian_str_equal (const void *s1, const void *s2)
{
return strcmp (s1, s2) == 0;
}

/*****************************************************************************/

/* RAND_MAX:      0x7fffffffU  This is a prime number */
/* UINT32_MAX:    0xffffffffU */
#if defined(__FreeBSD__) || defined(__linux__)
#define PRIME     0xfffffffbU /* for arc4random_uniform() or getrandom() */
#else
#define PRIME     0x7fffffffU /* for rand() */
#endif
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

typedef struct _TianHashable TianHashable;
struct _TianHashable {
unsigned random;
unsigned len;
};

typedef unsigned (* TianUHashFunc) (TianHashable *hashable,
const void   *key);

typedef struct _TianBucket TianBucket;
struct _TianBucket {
unsigned    random;      /* 1 <= k <= prime - 1 */
unsigned    entries_len; /* entries_len = 2 * m * (m - 1) */
unsigned    m;           /* entries_len = 2 * m * (m - 1) */
unsigned    b;           /* rehash: b = lists[i]->len, insert: b = b + 1 */
TianEntry **entries;
};

/* constant */
#define DYNFECT_C  1 /* m = (1 + c) * MAX (count, 4) in rehash() */
#define DYNFECT_S  1 /* buckets_len = S(M) = DYNFECT_S * dynfect->m */

typedef struct _TianDynfect TianDynfect;
struct _TianDynfect {
unsigned      random;      /* 1 <= k <= prime - 1 */
unsigned      buckets_len; /* buckets_len = S(M) = DYNFECT_S * dynfect->m */
TianUHashFunc hash_func;
TianEqualFunc key_equal_func;
TianFreeFunc  key_free_func;
TianFreeFunc  value_free_func;
TianBucket   *buckets;
unsigned      op_count; /* count++ when add, remove */
unsigned      m;        /* m = (1 + c) * MAX (count, 4) in rehash() */
unsigned      n_items;
};

/* universal hash for integer */
static unsigned tian_hashable_uhash_int (TianHashable *hashable,
const void   *key)
{
/* h(x) = (kx mod p) mod s; 1 <= k <= prime - 1 */
/* prime >= hashable->len */
return (((unsigned long) hashable->random * (unsigned long) key) %
(unsigned long) PRIME) % (unsigned long) hashable->len;
}

static unsigned tian_dynfect_gen_random ()
{
#if !defined(__FreeBSD__) && !defined(__linux__)
#if RAND_MAX != 0x7fffffffU
#error "Debug this source code"
#endif
static long seed = -1;

if (seed < 0)
{
/* This is insecure */
seed = time (NULL);
srand (seed);
}
#endif
/* h(x) = (kx mod p) mod s; 1 <= k <= prime - 1 */
#ifdef __FreeBSD__
return arc4random_uniform (PRIME - 1) + 1; /* uint32_t */
#elif __linux__
uint32_t buf;
(void) getrandom (&buf, sizeof (unsigned), 0);
return buf % (PRIME - 1) + 1;
#else
return rand() % (PRIME - 1) + 1; /* int */
#endif
}

static bool tian_dynfect_condition (TianDynfect *dynfect)
{
unsigned sum = 0;
/* 32 * M * M / S(M) + 4 * M */
/* S(M) is buckets_len */
/* buckets_len is DYNFECT_S * dynfect->m */
unsigned bound = 32 * dynfect->m / DYNFECT_S + 4 * dynfect->m;

for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
sum = sum + dynfect->buckets[i].m;

if (sum > bound)
return false;
}

return true;
}

static bool tian_dynfect_inject (TianDynfect *dynfect,
TianArray   *list,
TianBucket  *bucket)
{
TianEntry *entry;
unsigned   index;

bucket->random = tian_dynfect_gen_random ();

for (unsigned i = 0; i < list->len; i++)
{
entry = list->data[i];
index = dynfect->hash_func ((TianHashable *) bucket, entry->key);

if (bucket->entries[index])
return false;

bucket->entries[index] = entry;
}

return true;
}

static bool tian_dynfect_contains (TianDynfect *dynfect,
const void  *key)
{
unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
TianBucket *bucket = &dynfect->buckets[i];

if (bucket->entries_len > 0)
{
int pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];

if (entry && dynfect->key_equal_func (entry->key, key))
return true;
}

return false;
}

static void tian_dynfect_rehash (TianDynfect *dynfect, TianEntry *new_entry)
{
TianArray  *list;
TianArray **lists;

list = tian_array_new (NULL);

for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];
for (unsigned j = 0; j < bucket->entries_len; j++)
if (bucket->entries[j])

free (bucket->entries);
}

if (new_entry)
{
dynfect->n_items++;
}

dynfect->op_count = list->len;
dynfect->m = (1 + DYNFECT_C) * MAX (dynfect->op_count, 4);
dynfect->buckets_len = DYNFECT_S * dynfect->m;

dynfect->buckets = realloc (dynfect->buckets,
dynfect->buckets_len * sizeof (TianBucket));
memset (dynfect->buckets, 0, dynfect->buckets_len * sizeof (TianBucket));
lists = calloc (dynfect->buckets_len, sizeof (TianArray *));

do {
dynfect->random = tian_dynfect_gen_random ();

for (unsigned i = 0; i < dynfect->buckets_len; i++)
if (lists[i] && lists[i]->len)
tian_array_clear (lists[i]);

for (unsigned i = 0; i < list->len; i++)
{
TianEntry *entry = list->data[i];
unsigned pos = dynfect->hash_func ((TianHashable *) dynfect, entry->key);

if (!lists[pos])
lists[pos] = tian_array_new (NULL);

}

for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];
memset (bucket, 0, sizeof (TianBucket));
bucket->b = lists[i] ? lists[i]->len : 0;
bucket->m = 2 * bucket->b;
bucket->entries_len = 2 * bucket->m * (bucket->m - 1);
}
} while (!tian_dynfect_condition (dynfect));

tian_array_free (list, true);

for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];

if (lists[i] && (lists[i]->len > 0))
{
bucket->entries = calloc (bucket->entries_len, sizeof (TianEntry *));

do {
memset (bucket->entries, 0, bucket->entries_len * sizeof (TianEntry *));
} while (!tian_dynfect_inject (dynfect, lists[i], bucket));
}

if (lists[i])
tian_array_free (lists[i], true);
}

free (lists);
}

static TianDynfect *tian_dynfect_new (TianUHashFunc hash_func,
TianEqualFunc key_equal_func,
TianFreeFunc  key_free_func,
TianFreeFunc  value_free_func)
{
TianDynfect *dynfect = malloc (sizeof (TianDynfect));

dynfect->hash_func       = hash_func;
dynfect->key_equal_func  = key_equal_func;
dynfect->key_free_func   = key_free_func;
dynfect->value_free_func = value_free_func;
dynfect->op_count        = 0;
dynfect->buckets_len     = 0;
dynfect->buckets         = NULL;
dynfect->n_items         = 0;
dynfect->m               = 0;
tian_dynfect_rehash (dynfect, NULL);

return dynfect;
}

static bool tian_dynfect_insert (TianDynfect *dynfect,
void        *key,
void        *value)
{
unsigned    index = dynfect->hash_func ((TianHashable *) dynfect, key);
unsigned    pos;
TianBucket *bucket = &dynfect->buckets[index];

if (bucket->entries_len > 0)
{
pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];

if (entry)
{
if (dynfect->key_equal_func (entry->key, key))
{
if (dynfect->key_free_func)
dynfect->key_free_func (entry->key);

if (dynfect->value_free_func)
dynfect->value_free_func (entry->value);

entry->key   = key;
entry->value = value;

return false;
}
}
}

dynfect->op_count++;

if (dynfect->op_count > dynfect->m)
{
tian_dynfect_rehash (dynfect, tian_entry_new (key, value));

return true;
}

bucket->b++;

if (bucket->b <= bucket->m)
{
if (!bucket->entries[pos])
{
bucket->entries[pos] = tian_entry_new (key, value);
}
else
{
TianArray *list;

list = tian_array_new (NULL);

for (unsigned i = 0; i < bucket->entries_len; i++)
if (bucket->entries[i])

do {
if (bucket->entries == NULL)
bucket->entries = calloc (bucket->entries_len, sizeof (TianEntry *));
else
memset (bucket->entries,
0, bucket->entries_len * sizeof (TianEntry *));
} while (!tian_dynfect_inject (dynfect, list, bucket));

tian_array_free (list, true);
}
}
else
{
if (tian_dynfect_condition (dynfect))
{
TianArray *list;

list = tian_array_new (NULL);

if (bucket->entries)
for (unsigned j = 0; j < bucket->entries_len; j++)
if (bucket->entries[j])

bucket->m = 2 * MAX (bucket->m, 1);
bucket->entries_len = 2 * (bucket->m) * (bucket->m - 1);
bucket->entries = realloc (bucket->entries,
bucket->entries_len * sizeof (TianEntry *));
do {
memset (bucket->entries, 0, bucket->entries_len * sizeof (TianEntry *));
} while (!tian_dynfect_inject (dynfect, list, bucket));

tian_array_free (list, true);
}
else
{
tian_dynfect_rehash (dynfect, tian_entry_new (key, value));
}
}

dynfect->n_items++;

return true;
}

static bool tian_dynfect_remove (TianDynfect *dynfect, const void *key)
{
dynfect->op_count++;

unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
TianBucket *bucket = &dynfect->buckets[i];

if (bucket->entries_len < 1)
return false;

int pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];

if (entry == NULL)
return false;

tian_entry_free (entry,
dynfect->key_free_func,
dynfect->value_free_func);
bucket->entries[pos] = NULL;
dynfect->n_items--;

if (dynfect->op_count >= dynfect->m)
tian_dynfect_rehash (dynfect, NULL);

return true;
}

static void *tian_dynfect_lookup (TianDynfect *dynfect,
const void  *key)
{
unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
TianBucket *bucket = &dynfect->buckets[i];

if (bucket->entries_len > 0)
{
int pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];

if (entry && dynfect->key_equal_func (entry->key, key))
return entry->value;
}

return NULL;
}

static void tian_dynfect_free (TianDynfect *dynfect)
{
for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];

for (unsigned j = 0; j < bucket->entries_len; j++)
{
TianEntry *entry = bucket->entries[j];

if (entry)
tian_entry_free (entry,
dynfect->key_free_func,
dynfect->value_free_func);
}

free (bucket->entries);
}

free (dynfect->buckets);
free (dynfect);
}

unsigned tian_dynfect_size (TianDynfect *dynfect)
{
return dynfect->n_items;
}

/*****************************************************************************/

int main (int argc, char **argv)
{
puts ("====== start of dynfect test");

struct timespec ts1, ts2;
TianDynfect *dynfect;

dynfect = tian_dynfect_new (tian_hashable_uhash_int, tian_direct_equal, NULL, NULL);

/* replace test */
clock_gettime (CLOCK_MONOTONIC, &ts1);

for (uint32_t i = 0; i < 1000; i++)
tian_dynfect_insert (dynfect, (void *) (uint64_t) 10,
(void *) (uint64_t) i);

clock_gettime (CLOCK_MONOTONIC, &ts2);

printf ("tian: added 1000 times: %ld ms\n",
((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));
assert (tian_dynfect_size (dynfect) == 1);

for (uint32_t i = 0; i < 1000; i++)
{
uint32_t retval;
retval = (uint32_t) (uint64_t) tian_dynfect_lookup (dynfect, (void *) (uint64_t) 10);
assert (retval == 999);
assert (tian_dynfect_contains (dynfect, (void *) (uint64_t) 10));
}

assert (tian_dynfect_remove (dynfect, (void *) 10));

clock_gettime (CLOCK_MONOTONIC, &ts1);

for (uint32_t i = 0; i < 10000; i++)
tian_dynfect_insert (dynfect, (void *) (uint64_t) i,
(void *) (uint64_t) i);

clock_gettime (CLOCK_MONOTONIC, &ts2);

printf ("tian: added 10000 times: %ld ms\n",
((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));

clock_gettime (CLOCK_MONOTONIC, &ts1);

for (uint32_t i = 0; i < 10000; i++)
{
uint32_t retval;
retval = (uint32_t) (uint64_t) tian_dynfect_lookup (dynfect, (void *) (uint64_t) i);
assert (i == retval);
}

printf ("tian: lookup 10000 times: %ld ms\n",
((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));

tian_dynfect_free (dynfect);

return 0;
}
``````