동적 완벽 해싱

Wed, Feb 3 2021 02:07:25

2021년 3월 20일에 업데이트하였습니다.

해싱 알고리즘에는 여러 알고리즘이 있는데 동적 완벽 해싱(dynamic perfect hashing)에 perfect 라는 단어 때문에 궁극의 해싱인가 싶어서 구현해봤습니다.

https://en.wikipedia.org/wiki/Dynamic_perfect_hashing
논문 제목: Dynamic Perfect Hashing: Upper and Lower Bounds

동적 완벽 해싱이 이론 상 좋아보일 수는 있습니다. 하지만, 동적 완벽 해싱은 데이터를 삽입할 때 충돌이 발생하면 랜덤으로 해시 함수를 준비하여 다시 해싱을 하는 과정을 거칩니다. 이 과정에서 시간이 지연됩니다.

논문에서 h(x) 는 다음과 같습니다. k 의 범위는 1 ≤ kp - 1 입니다.
p 는 prime, s 는 공간 size 입니다.

key 를 주면 그걸 유니버설 해시 함수로 해시값을 만듭니다. mod 연산에서 시간 잡아먹고요.

h(x) = ((k * x) mod p) mod s

그리고…
알고리즘을 보면 insert 할 때 느린 건 이해는 하겠는데 lookup 할 때도 h(x) 를 두번 거치고, 수많은 버킷 배열과 엔트리 배열 때문에 시간이 지연됩니다. 그래서 해싱과는 별개로 배열 수만 늘려보고 테스트를 해봤는데, 단순히 배열 수만 늘어나도 array[i] 의 참조 연산 속도가 떨어지더군요. 이론과 실제는 다르다는 것을 또 다시 느낍니다.

i = h(x, random, PRIME, buckets_len)
entries = buckets[i]
j = h(x, random, PRIME, entries_len)
entry = entries[j]

그러면 이 알고리즘은 쓸모가 없는가? 그렇지 않습니다. 기존의 해시 테이블에서 충돌 해결 기법으로 동적 완벽 해싱을 사용하면 매우 좋을 것 같습니다.

다음 소스코드는 제가 논문 뒤져보면서 작성한 코드입니다.
논문에 몇몇 오류가 있는데 가급적 논문에 나온 그대로 작성하였습니다. 논문과 일부 다른 부분도 있습니다.
컴파일 방법은 다음과 같습니다. 최적화 옵션 -O1 을 넣지 않으면 너무 느리니 최적화 옵션을 넣어서 컴파일해보시기 바랍니다.

cc -O1 -Wall -o tian-dynfect tian-dynfect.c
/*
 * tian-dynfect.c
 * Copyright (C) 2021 Hodong Kim, All rights reserved.
 * Unauthorized copying of this software, via any medium is strictly prohibited.
 * Proprietary and confidential.
 * Written by Hodong Kim <hodong@nimfsoft.com> January,Febrary 2021
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>

typedef void (* TianFreeFunc)    (void *data);
typedef bool (* TianEqualFunc)   (const void *a, const void *b);

typedef struct _TianArray TianArray;
struct _TianArray {
  void         **data;
  unsigned int   len;
};

typedef struct _TianArrayPrivate TianArrayPrivate;
struct _TianArrayPrivate {
  void         **data;
  unsigned int   len;
  TianFreeFunc   free_func;
  unsigned int   capacity;
};

TianArray *tian_array_new (TianFreeFunc free_func)
{
  TianArrayPrivate *array;

  array = malloc (sizeof (TianArrayPrivate));
  array->capacity  = 1;
  array->data      = malloc (array->capacity * sizeof (void *));
  array->len       = 0;
  array->free_func = free_func;

  return (TianArray *) array;
}

void **tian_array_free (TianArray *array, bool free_data)
{
  TianArrayPrivate *priv = (TianArrayPrivate *) array;

  if (free_data)
  {
    if (priv->free_func)
      for (int i = 0; i < array->len; i++)
        if (array->data[i])
          priv->free_func (array->data[i]);

    free (array->data);
    free (array);
    return NULL;
  }
  else
  {
    void **data = array->data;
    free (array);
    return data;
  }
}

void tian_array_clear (TianArray *array)
{
  TianArrayPrivate *priv = (TianArrayPrivate *) array;

  if (priv->free_func)
    for (int i = 0; i < array->len; i++)
      if (array->data[i])
        priv->free_func (array->data[i]);

  priv->capacity = 1;
  array->data    = realloc (priv->data, priv->capacity * sizeof (void *));
  array->len     = 0;
}

void tian_array_add (TianArray *array, void *data)
{
  TianArrayPrivate *priv = (TianArrayPrivate *) array;

  if (array->len == priv->capacity)
  {
    priv->capacity = priv->capacity * 2;
    array->data = realloc (array->data, sizeof (void *) * priv->capacity);
  }

  array->data[array->len] = data;
  array->len++;
}

/*****************************************************************************/

typedef uint64_t (* TianHashFunc)  (const void *key);

typedef struct _TianEntry TianEntry;
struct _TianEntry {
  void *key;
  void *value;
};

TianEntry *tian_entry_new (void *key, void *value)
{
  TianEntry *entry = malloc (sizeof (TianEntry));
  entry->key   = key;
  entry->value = value;
  return entry;
}

void tian_entry_free (TianEntry *entry,
                      TianFreeFunc key_free_func,
                      TianFreeFunc value_free_func)
{
  if (key_free_func)
    key_free_func (entry->key);

  if (value_free_func)
    value_free_func (entry->value);

  free (entry);
}

/*****************************************************************************/

bool tian_direct_equal (const void *a, const void *b)
{
  return a == b;
}

bool tian_str_equal (const void *s1, const void *s2)
{
  return strcmp (s1, s2) == 0;
}

/*****************************************************************************/

/* RAND_MAX:      0x7fffffffU  This is a prime number */
/* UINT32_MAX:    0xffffffffU */
#if defined(__FreeBSD__) || defined(__linux__)
  #define PRIME     0xfffffffbU /* for arc4random_uniform() or getrandom() */
#else
  #define PRIME     0x7fffffffU /* for rand() */
#endif
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

typedef struct _TianHashable TianHashable;
struct _TianHashable {
  unsigned random;
  unsigned len;
};

typedef unsigned (* TianUHashFunc) (TianHashable *hashable,
                                    const void   *key);

typedef struct _TianBucket TianBucket;
struct _TianBucket {
  unsigned    random;      /* 1 <= k <= prime - 1 */
  unsigned    entries_len; /* entries_len = 2 * m * (m - 1) */
  unsigned    m;           /* entries_len = 2 * m * (m - 1) */
  unsigned    b;           /* rehash: b = lists[i]->len, insert: b = b + 1 */
  TianEntry **entries;
};

/* constant */
#define DYNFECT_C  1 /* m = (1 + c) * MAX (count, 4) in rehash() */
#define DYNFECT_S  1 /* buckets_len = S(M) = DYNFECT_S * dynfect->m */

typedef struct _TianDynfect TianDynfect;
struct _TianDynfect {
  unsigned      random;      /* 1 <= k <= prime - 1 */
  unsigned      buckets_len; /* buckets_len = S(M) = DYNFECT_S * dynfect->m */
  TianUHashFunc hash_func;
  TianEqualFunc key_equal_func;
  TianFreeFunc  key_free_func;
  TianFreeFunc  value_free_func;
  TianBucket   *buckets;
  unsigned      op_count; /* count++ when add, remove */
  unsigned      m;        /* m = (1 + c) * MAX (count, 4) in rehash() */
  unsigned      n_items;
};

/* universal hash for integer */
static unsigned tian_hashable_uhash_int (TianHashable *hashable,
                                         const void   *key)
{
  /* h(x) = (kx mod p) mod s; 1 <= k <= prime - 1 */
  /* prime >= hashable->len */
  return (((unsigned long) hashable->random * (unsigned long) key) %
          (unsigned long) PRIME) % (unsigned long) hashable->len;
}

static unsigned tian_dynfect_gen_random ()
{
#if !defined(__FreeBSD__) && !defined(__linux__)
  #if RAND_MAX != 0x7fffffffU
    #error "Debug this source code"
  #endif
  static long seed = -1;

  if (seed < 0)
  {
    /* This is insecure */
    seed = time (NULL);
    srand (seed);
  }
#endif
  /* h(x) = (kx mod p) mod s; 1 <= k <= prime - 1 */
#ifdef __FreeBSD__
  return arc4random_uniform (PRIME - 1) + 1; /* uint32_t */
#elif __linux__
  uint32_t buf;
  (void) getrandom (&buf, sizeof (unsigned), 0);
  return buf % (PRIME - 1) + 1;
#else
  return rand() % (PRIME - 1) + 1; /* int */
#endif
}

static bool tian_dynfect_condition (TianDynfect *dynfect)
{
  unsigned sum = 0;
  /* 32 * M * M / S(M) + 4 * M */
  /* S(M) is buckets_len */
  /* buckets_len is DYNFECT_S * dynfect->m */
  unsigned bound = 32 * dynfect->m / DYNFECT_S + 4 * dynfect->m;

  for (unsigned i = 0; i < dynfect->buckets_len; i++)
  {
    sum = sum + dynfect->buckets[i].m;

    if (sum > bound)
      return false;
  }

  return true;
}

static bool tian_dynfect_inject (TianDynfect *dynfect,
                                 TianArray   *list,
                                 TianBucket  *bucket)
{
  TianEntry *entry;
  unsigned   index;

  bucket->random = tian_dynfect_gen_random ();

  for (unsigned i = 0; i < list->len; i++)
  {
    entry = list->data[i];
    index = dynfect->hash_func ((TianHashable *) bucket, entry->key);

    if (bucket->entries[index])
      return false;

    bucket->entries[index] = entry;
  }

  return true;
}

static bool tian_dynfect_contains (TianDynfect *dynfect,
                                   const void  *key)
{
  unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
  TianBucket *bucket = &dynfect->buckets[i];

  if (bucket->entries_len > 0)
  {
    int pos = dynfect->hash_func ((TianHashable *) bucket, key);
    TianEntry *entry = bucket->entries[pos];

    if (entry && dynfect->key_equal_func (entry->key, key))
      return true;
  }

  return false;
}

static void tian_dynfect_rehash (TianDynfect *dynfect, TianEntry *new_entry)
{
  TianArray  *list;
  TianArray **lists;

  list = tian_array_new (NULL);

  for (unsigned i = 0; i < dynfect->buckets_len; i++)
  {
    TianBucket *bucket = &dynfect->buckets[i];
    for (unsigned j = 0; j < bucket->entries_len; j++)
      if (bucket->entries[j])
        tian_array_add (list, bucket->entries[j]);

    free (bucket->entries);
  }

  if (new_entry)
  {
    tian_array_add (list, new_entry);
    dynfect->n_items++;
  }

  dynfect->op_count = list->len;
  dynfect->m = (1 + DYNFECT_C) * MAX (dynfect->op_count, 4);
  dynfect->buckets_len = DYNFECT_S * dynfect->m;

  dynfect->buckets = realloc (dynfect->buckets,
                              dynfect->buckets_len * sizeof (TianBucket));
  memset (dynfect->buckets, 0, dynfect->buckets_len * sizeof (TianBucket));
  lists = calloc (dynfect->buckets_len, sizeof (TianArray *));

  do {
    dynfect->random = tian_dynfect_gen_random ();

    for (unsigned i = 0; i < dynfect->buckets_len; i++)
      if (lists[i] && lists[i]->len)
        tian_array_clear (lists[i]);

    for (unsigned i = 0; i < list->len; i++)
    {
      TianEntry *entry = list->data[i];
      unsigned pos = dynfect->hash_func ((TianHashable *) dynfect, entry->key);

      if (!lists[pos])
        lists[pos] = tian_array_new (NULL);

      tian_array_add (lists[pos], entry);
    }

    for (unsigned i = 0; i < dynfect->buckets_len; i++)
    {
      TianBucket *bucket = &dynfect->buckets[i];
      memset (bucket, 0, sizeof (TianBucket));
      bucket->b = lists[i] ? lists[i]->len : 0;
      bucket->m = 2 * bucket->b;
      bucket->entries_len = 2 * bucket->m * (bucket->m - 1);
    }
  } while (!tian_dynfect_condition (dynfect));

  tian_array_free (list, true);

  for (unsigned i = 0; i < dynfect->buckets_len; i++)
  {
    TianBucket *bucket = &dynfect->buckets[i];

    if (lists[i] && (lists[i]->len > 0))
    {
      bucket->entries = calloc (bucket->entries_len, sizeof (TianEntry *));

      do {
        memset (bucket->entries, 0, bucket->entries_len * sizeof (TianEntry *));
      } while (!tian_dynfect_inject (dynfect, lists[i], bucket));
    }

    if (lists[i])
      tian_array_free (lists[i], true);
  }

  free (lists);
}

static TianDynfect *tian_dynfect_new (TianUHashFunc hash_func,
                                      TianEqualFunc key_equal_func,
                                      TianFreeFunc  key_free_func,
                                      TianFreeFunc  value_free_func)
{
  TianDynfect *dynfect = malloc (sizeof (TianDynfect));

  dynfect->hash_func       = hash_func;
  dynfect->key_equal_func  = key_equal_func;
  dynfect->key_free_func   = key_free_func;
  dynfect->value_free_func = value_free_func;
  dynfect->op_count        = 0;
  dynfect->buckets_len     = 0;
  dynfect->buckets         = NULL;
  dynfect->n_items         = 0;
  dynfect->m               = 0;
  tian_dynfect_rehash (dynfect, NULL);

  return dynfect;
}

static bool tian_dynfect_insert (TianDynfect *dynfect,
                                 void        *key,
                                 void        *value)
{
  unsigned    index = dynfect->hash_func ((TianHashable *) dynfect, key);
  unsigned    pos;
  TianBucket *bucket = &dynfect->buckets[index];

  if (bucket->entries_len > 0)
  {
    pos = dynfect->hash_func ((TianHashable *) bucket, key);
    TianEntry *entry = bucket->entries[pos];

    if (entry)
    {
      if (dynfect->key_equal_func (entry->key, key))
      {
        if (dynfect->key_free_func)
          dynfect->key_free_func (entry->key);

        if (dynfect->value_free_func)
          dynfect->value_free_func (entry->value);

        entry->key   = key;
        entry->value = value;

        return false;
      }
    }
  }

  dynfect->op_count++;

  if (dynfect->op_count > dynfect->m)
  {
    tian_dynfect_rehash (dynfect, tian_entry_new (key, value));

    return true;
  }

  bucket->b++;

  if (bucket->b <= bucket->m)
  {
    if (!bucket->entries[pos])
    {
      bucket->entries[pos] = tian_entry_new (key, value);
    }
    else
    {
      TianArray *list;

      list = tian_array_new (NULL);

      for (unsigned i = 0; i < bucket->entries_len; i++)
        if (bucket->entries[i])
          tian_array_add (list, bucket->entries[i]);

      tian_array_add (list, tian_entry_new (key, value));

      do {
        if (bucket->entries == NULL)
          bucket->entries = calloc (bucket->entries_len, sizeof (TianEntry *));
        else
          memset (bucket->entries,
                  0, bucket->entries_len * sizeof (TianEntry *));
      } while (!tian_dynfect_inject (dynfect, list, bucket));

      tian_array_free (list, true);
    }
  }
  else
  {
    if (tian_dynfect_condition (dynfect))
    {
      TianArray *list;

      list = tian_array_new (NULL);

      if (bucket->entries)
        for (unsigned j = 0; j < bucket->entries_len; j++)
          if (bucket->entries[j])
            tian_array_add (list, bucket->entries[j]);

      tian_array_add (list, tian_entry_new (key, value));

      bucket->m = 2 * MAX (bucket->m, 1);
      bucket->entries_len = 2 * (bucket->m) * (bucket->m - 1);
      bucket->entries = realloc (bucket->entries,
                                 bucket->entries_len * sizeof (TianEntry *));
      do {
        memset (bucket->entries, 0, bucket->entries_len * sizeof (TianEntry *));
      } while (!tian_dynfect_inject (dynfect, list, bucket));

      tian_array_free (list, true);
    }
    else
    {
      tian_dynfect_rehash (dynfect, tian_entry_new (key, value));
    }
  }

  dynfect->n_items++;

  return true;
}

static bool tian_dynfect_remove (TianDynfect *dynfect, const void *key)
{
  dynfect->op_count++;

  unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
  TianBucket *bucket = &dynfect->buckets[i];

  if (bucket->entries_len < 1)
    return false;

  int pos = dynfect->hash_func ((TianHashable *) bucket, key);
  TianEntry *entry = bucket->entries[pos];

  if (entry == NULL)
    return false;

  tian_entry_free (entry,
                   dynfect->key_free_func,
                   dynfect->value_free_func);
  bucket->entries[pos] = NULL;
  dynfect->n_items--;

  if (dynfect->op_count >= dynfect->m)
    tian_dynfect_rehash (dynfect, NULL);

  return true;
}

static void *tian_dynfect_lookup (TianDynfect *dynfect,
                                  const void  *key)
{
  unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
  TianBucket *bucket = &dynfect->buckets[i];

  if (bucket->entries_len > 0)
  {
    int pos = dynfect->hash_func ((TianHashable *) bucket, key);
    TianEntry *entry = bucket->entries[pos];

    if (entry && dynfect->key_equal_func (entry->key, key))
      return entry->value;
  }

  return NULL;
}

static void tian_dynfect_free (TianDynfect *dynfect)
{
  for (unsigned i = 0; i < dynfect->buckets_len; i++)
  {
    TianBucket *bucket = &dynfect->buckets[i];

    for (unsigned j = 0; j < bucket->entries_len; j++)
    {
      TianEntry *entry = bucket->entries[j];

      if (entry)
        tian_entry_free (entry,
                         dynfect->key_free_func,
                         dynfect->value_free_func);
    }

    free (bucket->entries);
  }

  free (dynfect->buckets);
  free (dynfect);
}

unsigned tian_dynfect_size (TianDynfect *dynfect)
{
  return dynfect->n_items;
}

/*****************************************************************************/

int main (int argc, char **argv)
{
  puts ("====== start of dynfect test");

  struct timespec ts1, ts2;
  TianDynfect *dynfect;

  dynfect = tian_dynfect_new (tian_hashable_uhash_int, tian_direct_equal, NULL, NULL);

  /* replace test */
  clock_gettime (CLOCK_MONOTONIC, &ts1);

  for (uint32_t i = 0; i < 1000; i++)
    tian_dynfect_insert (dynfect, (void *) (uint64_t) 10,
                                  (void *) (uint64_t) i);

  clock_gettime (CLOCK_MONOTONIC, &ts2);

  printf ("tian: added 1000 times: %ld ms\n",
    ((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
  printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));
  assert (tian_dynfect_size (dynfect) == 1);

  for (uint32_t i = 0; i < 1000; i++)
  {
    uint32_t retval;
    retval = (uint32_t) (uint64_t) tian_dynfect_lookup (dynfect, (void *) (uint64_t) 10);
    assert (retval == 999);
    assert (tian_dynfect_contains (dynfect, (void *) (uint64_t) 10));
  }

  assert (tian_dynfect_remove (dynfect, (void *) 10));

  clock_gettime (CLOCK_MONOTONIC, &ts1);

  for (uint32_t i = 0; i < 10000; i++)
    tian_dynfect_insert (dynfect, (void *) (uint64_t) i,
                                  (void *) (uint64_t) i);

  clock_gettime (CLOCK_MONOTONIC, &ts2);

  printf ("tian: added 10000 times: %ld ms\n",
    ((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
  printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));


  clock_gettime (CLOCK_MONOTONIC, &ts1);

  for (uint32_t i = 0; i < 10000; i++)
  {
    uint32_t retval;
    retval = (uint32_t) (uint64_t) tian_dynfect_lookup (dynfect, (void *) (uint64_t) i);
    assert (i == retval);
  }

  printf ("tian: lookup 10000 times: %ld ms\n",
    ((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
  printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));

  tian_dynfect_free (dynfect);

  return 0;
}