From d9f901f36d9ece8c751f6fcd45accc723885a70e Mon Sep 17 00:00:00 2001
From: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date: Sat, 19 Aug 2017 15:29:52 -0500
Subject: [PATCH] Squashed commit of the following:

commit 50f414a45d58fcab664ff662dd27befcfa0fdd95
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 13:43:35 2017 -0500

    Converted file_id_t set to unordered_set with custom hash

commit 83ef2dd7cc1bc3e4fdf0b2d3546d6811326cc3c9
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 13:43:14 2017 -0500

    Converted remaining set<wcstring> to unordered_set<wcstring>

commit 053da88f933f27505b3cf4810402e2a2be070203
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 13:29:21 2017 -0500

    Switched function sets to unordered_set

commit d469742a14ac99599022a9258cda8255178826b5
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 13:21:32 2017 -0500

    Converted list of modified variables to an unordered set

commit 5c06f866beeafb23878b1a932c7cd2558412c283
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 13:15:20 2017 -0500

    Convert const_string_set_t to std::unordered_set

    As it is a read-only list of raw wide-character pointer strings (not
    wcstring), this necessitated the addition of a hashing function, since
    the C++ standard library does not come with a hash function that
    hashes the contents of a character pointer string.

    To that end, a zlib-licensed [0] port of the excellent, lightweight
    XXHash family of 32- and 64-bit hashing algorithms, in the form of a
    header-only C++ library, has been included. XXHash32/64 is pretty
    much universally the fastest hashing library for general-purpose
    applications, has been thoroughly vetted, and is used in countless
    open source projects. The single-header version of this library makes it
    a lot simpler to include in the fish project, and its license
    compatibility with fish's GPLv2 and its zero-dependency nature should
    make it an easy decision.

    std::unordered_set brings a massive speedup as compared to the default
    std::set, and the further use of the fast XXHash library to provide the
    string hashing should make all forms of string lookups in fish
    significantly faster (to a user-noticeable extent).

    0: http://create.stephan-brumme.com/about.html

commit 30d7710be8f0c23a4d42f7e713fcb7850f99036e
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 12:29:39 2017 -0500

    Using std::unordered_set for completions backing store

    While the completions shown to the user are sorted, their storage in
    memory does not need to be since they are re-sorted before they are
    shown in completions.cpp.

commit 695e83331d7a60ba188e57f6ea0d9b6da54860c6
Author: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date:   Sat Aug 19 12:06:53 2017 -0500

    Updated is_loading to use unordered_set
---
 src/autoload.cpp             |   5 +-
 src/autoload.h               |   3 +-
 src/builtin_set.cpp          |   2 +-
 src/complete.cpp             |  30 +++---
 src/env.cpp                  |  18 +++-
 src/env_universal_common.cpp |   4 +-
 src/env_universal_common.h   |   4 +-
 src/function.cpp             |  13 ++-
 src/highlight.cpp            |   4 +-
 src/history.cpp              |   5 +-
 src/history.h                |   4 +-
 src/screen.h                 |   4 +-
 src/wildcard.cpp             |   6 +-
 src/wutil.h                  |  13 +++
 src/xxhash32.h               | 155 +++++++++++++++++++++++++++++++
 src/xxhash64.h               | 174 +++++++++++++++++++++++++++++++++++
 16 files changed, 402 insertions(+), 42 deletions(-)
 create mode 100644 src/xxhash32.h
 create mode 100644 src/xxhash64.h
diff --git a/src/autoload.cpp b/src/autoload.cpp
index eb252277f..f701dcdd3 100644
--- a/src/autoload.cpp
+++ b/src/autoload.cpp
@@ -84,9 +84,8 @@ int autoload_t::load(const wcstring &cmd, bool reload) {
     // Mark that we're loading this. Hang onto the iterator for fast erasing later. Note that
     // std::set has guarantees about not invalidating iterators, so this is safe to do across the
     // callouts below.
-    typedef std::set<wcstring>::iterator set_iterator_t;
-    std::pair<set_iterator_t, bool> insert_result = is_loading_set.insert(cmd);
-    set_iterator_t where = insert_result.first;
+    auto insert_result = is_loading_set.insert(cmd);
+    auto where = insert_result.first;
     bool inserted = insert_result.second;
 
     // Warn and fail on infinite recursion. It's OK to do this because this function is only called
diff --git a/src/autoload.h b/src/autoload.h
index 0705c5616..fe3372cb2 100644
--- a/src/autoload.h
+++ b/src/autoload.h
@@ -6,6 +6,7 @@
 #include <time.h>
 
 #include <set>
+#include <unordered_set>
 
 #include "common.h"
 #include "env.h"
@@ -54,7 +55,7 @@ class autoload_t : public lru_cache_t<autoload_t, autoload_function_t> {
     wcstring_list_t last_path_tokenized;
     /// A table containing all the files that are currently being loaded.
     /// This is here to help prevent recursion.
-    std::set<wcstring> is_loading_set;
+    std::unordered_set<wcstring> is_loading_set;
     // Function invoked when a command is removed
     typedef void (*command_removed_function_t)(const wcstring &);
     const command_removed_function_t command_removed;
diff --git a/src/builtin_set.cpp b/src/builtin_set.cpp
index b74df2f76..e075ee738 100644
--- a/src/builtin_set.cpp
+++ b/src/builtin_set.cpp
@@ -414,7 +414,7 @@ static void erase_values(wcstring_list_t &list, const std::vector<long> &indexes
 
     // Now walk the set backwards, so we encounter larger indexes first, and remove elements at the
     // given (1-based) indexes.
-    std::set<long>::const_reverse_iterator iter;
+    decltype(indexes_set)::const_reverse_iterator iter;
     for (iter = indexes_set.rbegin(); iter != indexes_set.rend(); ++iter) {
         long val = *iter;
         if (val > 0 && (size_t)val <= list.size()) {
diff --git a/src/complete.cpp b/src/complete.cpp
index 48c54b973..f937a0f9f 100644
--- a/src/complete.cpp
+++ b/src/complete.cpp
@@ -158,17 +158,22 @@ class completion_entry_t {
 };
 
 /// Set of all completion entries.
-struct completion_entry_set_comparer {
-    /** Comparison for std::set */
-    bool operator()(const completion_entry_t &p1, const completion_entry_t &p2) const {
-        // Paths always come last for no particular reason.
-        if (p1.cmd_is_path != p2.cmd_is_path) {
-            return p1.cmd_is_path < p2.cmd_is_path;
+namespace std {
+    template<>
+    struct hash<completion_entry_t> {  // hash support for completion_entry_set_t
+        size_t operator()(const completion_entry_t &c) const {
+            // cmd_is_path is not hashed; entries differing only in it share a bucket.
+            return std::hash<wcstring>()(c.cmd);
         }
-        return p1.cmd < p2.cmd;
-    }
-};
-typedef std::set<completion_entry_t, completion_entry_set_comparer> completion_entry_set_t;
+    };
+    template <>
+    struct equal_to<completion_entry_t> {  // same identity as the old comparer: path flag + cmd
+        bool operator()(const completion_entry_t &c1, const completion_entry_t &c2) const {
+            return c1.cmd_is_path == c2.cmd_is_path && c1.cmd == c2.cmd;
+        }
+    };
+}
+typedef std::unordered_set<completion_entry_t> completion_entry_set_t;
 static completion_entry_set_t completion_set;
 
 /// Comparison function to sort completions by their order field.
@@ -417,8 +422,7 @@ bool completer_t::condition_test(const wcstring &condition) {
 static completion_entry_t &complete_get_exact_entry(const wcstring &cmd, bool cmd_is_path) {
     ASSERT_IS_LOCKED(completion_lock);
 
-    std::pair<completion_entry_set_t::iterator, bool> ins =
-        completion_set.insert(completion_entry_t(cmd, cmd_is_path));
+    auto ins = completion_set.emplace(completion_entry_t(cmd, cmd_is_path));
 
     // NOTE SET_ELEMENTS_ARE_IMMUTABLE: Exposing mutable access here is only okay as long as callers
     // do not change any field that matters to ordering - affecting order without telling std::set
@@ -1610,7 +1614,7 @@ wcstring_list_t complete_get_wrap_chain(const wcstring &command) {
     const wrapper_map_t &wraps = wrap_map();
 
     wcstring_list_t result;
-    std::set<wcstring> visited;            // set of visited commands
+    std::unordered_set<wcstring> visited;            // set of visited commands
     wcstring_list_t to_visit(1, command);  // stack of remaining-to-visit commands
 
     wcstring target;
diff --git a/src/env.cpp b/src/env.cpp
index 0030e3aa0..69c62fb1b 100644
--- a/src/env.cpp
+++ b/src/env.cpp
@@ -33,6 +33,7 @@
 #include <set>
 #include <type_traits>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -54,6 +55,7 @@
 #include "sanity.h"
 #include "screen.h"
 #include "wutil.h"  // IWYU pragma: keep
+#include "xxhash64.h"
 
 #define DEFAULT_TERM1 "ansi"
 #define DEFAULT_TERM2 "dumb"
@@ -325,7 +327,21 @@ static env_universal_t *uvars() { return s_universal_variables; }
 struct const_string_set_comparer {
     bool operator()(const wchar_t *a, const wchar_t *b) { return wcscmp(a, b) < 0; }
 };
-typedef std::set<const wchar_t *, const_string_set_comparer> const_string_set_t;
+/// Content hash for NUL-terminated wide strings (std::hash may not be specialized for pointers).
+struct const_string_hasher {
+    size_t operator()(const wchar_t *p) const {
+        // XXHash64::hash() takes a byte count, so scale the character count by sizeof(wchar_t).
+        return static_cast<size_t>(XXHash64::hash(p, wcslen(p) * sizeof(wchar_t), 0));
+    }
+};
+/// Treats two NUL-terminated wide strings as equal when their contents match.
+struct const_string_equal {
+    bool operator()(const wchar_t *a, const wchar_t *b) const {
+        return wcscmp(a, b) == 0;
+    }
+};
+typedef std::unordered_set<const wchar_t *, const_string_hasher, const_string_equal>
+    const_string_set_t;
 
 /// Table of variables that may not be set using the set command.
 static const_string_set_t env_read_only;
diff --git a/src/env_universal_common.cpp b/src/env_universal_common.cpp
index 94120da41..570c8b08b 100644
--- a/src/env_universal_common.cpp
+++ b/src/env_universal_common.cpp
@@ -375,9 +375,7 @@ void env_universal_t::generate_callbacks(const var_table_t &new_vars,
 
 void env_universal_t::acquire_variables(var_table_t &vars_to_acquire) {
     // Copy modified values from existing vars to vars_to_acquire.
-    for (std::set<wcstring>::iterator iter = this->modified.begin(); iter != this->modified.end();
-         ++iter) {
-        const wcstring &key = *iter;
+    for (const auto &key : this->modified) {
         var_table_t::iterator src_iter = this->vars.find(key);
         if (src_iter == this->vars.end()) {
             /* The value has been deleted. */
diff --git a/src/env_universal_common.h b/src/env_universal_common.h
index 5b645486a..2755d3897 100644
--- a/src/env_universal_common.h
+++ b/src/env_universal_common.h
@@ -6,7 +6,7 @@
 #include <stdio.h>
 
 #include <memory>
-#include <set>
+#include <unordered_set>
 #include <vector>
 
 #include "common.h"
@@ -34,7 +34,7 @@ class env_universal_t {
 
     // Keys that have been modified, and need to be written. A value here that is not present in
     // vars indicates a deleted value.
-    std::set<wcstring> modified;
+    std::unordered_set<wcstring> modified;
 
     // Path that we save to. If empty, use the default.
     const wcstring explicit_vars_path;
diff --git a/src/function.cpp b/src/function.cpp
index 6571f3081..e0e028a88 100644
--- a/src/function.cpp
+++ b/src/function.cpp
@@ -12,7 +12,7 @@
 
 #include <map>
 #include <memory>
-#include <set>
+#include <unordered_set>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -33,7 +33,7 @@ typedef std::unordered_map<wcstring, function_info_t> function_map_t;
 static function_map_t loaded_functions;
 
 /// Functions that shouldn't be autoloaded (anymore).
-static std::set<wcstring> function_tombstones;
+static std::unordered_set<wcstring> function_tombstones;
 
 /// Lock for functions.
 static std::recursive_mutex functions_lock;
@@ -76,7 +76,7 @@ static int load(const wcstring &name) {
 }
 
 /// Insert a list of all dynamically loaded functions into the specified list.
-static void autoload_names(std::set<wcstring> &names, int get_hidden) {
+static void autoload_names(std::unordered_set<wcstring> &names, int get_hidden) {
     size_t i;
 
     const env_var_t path_var = env_get(L"fish_function_path");
@@ -282,13 +282,12 @@ bool function_copy(const wcstring &name, const wcstring &new_name) {
 }
 
 wcstring_list_t function_get_names(int get_hidden) {
-    std::set<wcstring> names;
+    std::unordered_set<wcstring> names;
     scoped_rlock locker(functions_lock);
     autoload_names(names, get_hidden);
 
-    function_map_t::const_iterator iter;
-    for (iter = loaded_functions.begin(); iter != loaded_functions.end(); ++iter) {
-        const wcstring &name = iter->first;
+    for (const auto &func : loaded_functions) {
+        const wcstring &name = func.first;
 
         // Maybe skip hidden.
         if (!get_hidden && (name.empty() || name.at(0) == L'_')) {
diff --git a/src/highlight.cpp b/src/highlight.cpp
index bbd3abac9..ae8d7955d 100644
--- a/src/highlight.cpp
+++ b/src/highlight.cpp
@@ -10,10 +10,10 @@
 
 #include <algorithm>
 #include <memory>
-#include <set>
 #include <string>
 #include <type_traits>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 
 #include "builtin.h"
@@ -146,7 +146,7 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l
 
     // Don't test the same path multiple times, which can happen if the path is absolute and the
     // CDPATH contains multiple entries.
-    std::set<wcstring> checked_paths;
+    std::unordered_set<wcstring> checked_paths;
 
     // Keep a cache of which paths / filesystems are case sensitive.
     case_sensitivity_cache_t case_sensitivity_cache;
diff --git a/src/history.cpp b/src/history.cpp
index 89a2f8908..71c678563 100644
--- a/src/history.cpp
+++ b/src/history.cpp
@@ -26,6 +26,7 @@
 #include <map>
 #include <numeric>
 #include <type_traits>
+#include <unordered_set>
 
 #include "common.h"
 #include "env.h"
@@ -846,7 +847,7 @@ void history_t::get_string_representation(wcstring *result, const wcstring &sepa
 
     bool first = true;
 
-    std::set<wcstring> seen;
+    std::unordered_set<wcstring> seen;
 
     // If we have a pending item, we skip the first encountered (i.e. last) new item.
     bool next_is_pending = this->has_pending_item;
@@ -1161,7 +1162,7 @@ void history_t::clear_file_state() {
 void history_t::compact_new_items() {
     // Keep only the most recent items with the given contents. This algorithm could be made more
     // efficient, but likely would consume more memory too.
-    std::set<wcstring> seen;
+    std::unordered_set<wcstring> seen;
     size_t idx = new_items.size();
     while (idx--) {
         const history_item_t &item = new_items[idx];
diff --git a/src/history.h b/src/history.h
index 643d7d230..9ff339a32 100644
--- a/src/history.h
+++ b/src/history.h
@@ -12,7 +12,7 @@
 
 #include <deque>
 #include <memory>
-#include <set>
+#include <unordered_set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -139,7 +139,7 @@ class history_t {
     uint32_t disable_automatic_save_counter;
 
     // Deleted item contents.
-    std::set<wcstring> deleted_items;
+    std::unordered_set<wcstring> deleted_items;
 
     // The mmaped region for the history file.
     const char *mmap_start;
diff --git a/src/screen.h b/src/screen.h
index c06ffaf12..4c984d035 100644
--- a/src/screen.h
+++ b/src/screen.h
@@ -17,7 +17,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <memory>
-#include <set>
+#include <unordered_set>
 #include <unordered_map>
 #include <vector>
 
@@ -203,7 +203,7 @@ size_t escape_code_length(const wchar_t *code);
 class cached_esc_sequences_t {
    private:
     // Cached escape sequences we've already detected in the prompt and similar strings.
-    std::set<wcstring> cache;
+    std::unordered_set<wcstring> cache;
     // The escape sequence lengths we've cached. My original implementation used min and max
     // length variables. The cache was then iterated over using a loop like this:
     // `for (size_t l = min; l <= max; l++)`.
diff --git a/src/wildcard.cpp b/src/wildcard.cpp
index f2537ade6..3089d4692 100644
--- a/src/wildcard.cpp
+++ b/src/wildcard.cpp
@@ -11,8 +11,8 @@
 #include <wchar.h>
 
 #include <memory>
-#include <set>
 #include <string>
+#include <unordered_set>
 #include <utility>
 
 #include "common.h"
@@ -439,9 +439,9 @@ class wildcard_expander_t {
     // The working directory to resolve paths against
     const wcstring working_directory;
     // The set of items we have resolved, used to efficiently avoid duplication.
-    std::set<wcstring> completion_set;
+    std::unordered_set<wcstring> completion_set;
     // The set of file IDs we have visited, used to avoid symlink loops.
-    std::set<file_id_t> visited_files;
+    std::unordered_set<file_id_t> visited_files;
     // Flags controlling expansion.
     const expand_flags_t flags;
     // Resolved items get inserted into here. This is transient of course.
diff --git a/src/wutil.h b/src/wutil.h
index 2b9470e72..4ef5969e6 100644
--- a/src/wutil.h
+++ b/src/wutil.h
@@ -143,6 +143,19 @@ struct file_id_t {
     int compare_file_id(const file_id_t &rhs) const;
 };
 
+#ifndef HASH_FILE_ID
+#define HASH_FILE_ID 1
+#include "xxhash64.h"
+namespace std {  // std::hash specialization so file_id_t can key unordered containers
+    template<>
+    struct hash<file_id_t> {
+        size_t operator()(const file_id_t &f) const {  // hashes the object's raw bytes, seed 0
+            return XXHash64::hash(&f, sizeof(f), 0);  // NOTE(review): includes padding, if any
+        }
+    };
+}
+#endif
+
 file_id_t file_id_for_fd(int fd);
 file_id_t file_id_for_path(const wcstring &path);
 
diff --git a/src/xxhash32.h b/src/xxhash32.h
new file mode 100644
index 000000000..acabfb34c
--- /dev/null
+++ b/src/xxhash32.h
@@ -0,0 +1,163 @@
+// //////////////////////////////////////////////////////////
+// xxhash32.h
+// Copyright (c) 2016 Stephan Brumme. All rights reserved.
+// see http://create.stephan-brumme.com/disclaimer.html
+//
+// Modified from the original: multi-byte loads go through memcpy so unaligned input is safe.
+#pragma once
+#include <stdint.h> // for uint32_t and uint64_t
+#include <string.h> // for memcpy
+/// XXHash (32 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
+/** How to use:
+    uint32_t myseed = 0;
+    XXHash32 myhash(myseed);
+    myhash.add(pointerToSomeBytes,     numberOfBytes);
+    myhash.add(pointerToSomeMoreBytes, numberOfMoreBytes); // call add() as often as you like to ...
+    // and compute hash:
+    uint32_t result = myhash.hash();
+    // or all of the above in one single line:
+    uint32_t result2 = XXHash32::hash(mypointer, numBytes, myseed);
+    Note: my code is NOT endian-aware !
+**/
+class XXHash32
+{
+public:
+  /// create new XXHash (32 bit)
+  /** @param seed your seed value, even zero is a valid seed and e.g. used by LZ4 **/
+  explicit XXHash32(uint32_t seed)
+  {
+    state[0] = seed + Prime1 + Prime2;
+    state[1] = seed + Prime2;
+    state[2] = seed;
+    state[3] = seed - Prime1;
+    bufferSize  = 0;
+    totalLength = 0;
+  }
+  /// add a chunk of bytes
+  /** @param  input  pointer to a continuous block of data
+      @param  length number of bytes
+      @return false if parameters are invalid / zero **/
+  bool add(const void* input, uint64_t length)
+  {
+    // no data ?
+    if (!input || length == 0)
+      return false;
+    totalLength += length;
+    // byte-wise access
+    const unsigned char* data = (const unsigned char*)input;
+    // unprocessed old data plus new data still fit in temporary buffer ?
+    if (bufferSize + length < MaxBufferSize)
+    {
+      // just add new data
+      while (length-- > 0)
+        buffer[bufferSize++] = *data++;
+      return true;
+    }
+    // point beyond last byte
+    const unsigned char* stop = data + length;
+    // some data left from previous update ?
+    if (bufferSize > 0)
+    {
+      // make sure temporary buffer is full (16 bytes)
+      while (bufferSize < MaxBufferSize)
+        buffer[bufferSize++] = *data++;
+      // process these 16 bytes (4x4)
+      process(buffer, state[0], state[1], state[2], state[3]);
+    }
+    // copying state to local variables helps optimizer A LOT
+    uint32_t s0 = state[0], s1 = state[1], s2 = state[2], s3 = state[3];
+    // 16 bytes at once, for as long as a whole block is left
+    while ((uint64_t)(stop - data) >= MaxBufferSize)
+    {
+      // local variables s0..s3 instead of state[0]..state[3] are much faster
+      process(data, s0, s1, s2, s3);
+      data += 16;
+    }
+    // copy back
+    state[0] = s0; state[1] = s1; state[2] = s2; state[3] = s3;
+    // copy remainder to temporary buffer
+    bufferSize = (unsigned int)(stop - data);
+    for (unsigned int i = 0; i < bufferSize; i++)
+      buffer[i] = data[i];
+    // done
+    return true;
+  }
+  /// get current hash
+  /** @return 32 bit XXHash **/
+  uint32_t hash() const
+  {
+    uint32_t result = (uint32_t)totalLength;
+    // fold 128 bit state into one single 32 bit value
+    if (totalLength >= MaxBufferSize)
+      result += rotateLeft(state[0],  1) +
+                rotateLeft(state[1],  7) +
+                rotateLeft(state[2], 12) +
+                rotateLeft(state[3], 18);
+    else
+      // internal state wasn't set in add(), therefore original seed is still stored in state[2]
+      result += state[2] + Prime5;
+    // process remaining bytes in temporary buffer
+    const unsigned char* data = buffer;
+    // point beyond last byte
+    const unsigned char* stop = data + bufferSize;
+    // at least 4 bytes left ? => eat 4 bytes per step
+    for (; data + 4 <= stop; data += 4)
+      result = rotateLeft(result + read32(data) * Prime3, 17) * Prime4;
+    // take care of remaining 0..3 bytes, eat 1 byte per step
+    while (data != stop)
+      result = rotateLeft(result +        (*data++) * Prime5, 11) * Prime1;
+    // mix bits
+    result ^= result >> 15;
+    result *= Prime2;
+    result ^= result >> 13;
+    result *= Prime3;
+    result ^= result >> 16;
+    return result;
+  }
+  /// combine constructor, add() and hash() in one static function (C style)
+  /** @param  input  pointer to a continuous block of data
+      @param  length number of bytes
+      @param  seed your seed value, e.g. zero is a valid seed and used by LZ4
+      @return 32 bit XXHash **/
+  static uint32_t hash(const void* input, uint64_t length, uint32_t seed)
+  {
+    XXHash32 hasher(seed);
+    hasher.add(input, length);
+    return hasher.hash();
+  }
+private:
+  /// magic constants :-)
+  static const uint32_t Prime1 = 2654435761U;
+  static const uint32_t Prime2 = 2246822519U;
+  static const uint32_t Prime3 = 3266489917U;
+  static const uint32_t Prime4 =  668265263U;
+  static const uint32_t Prime5 =  374761393U;
+  /// temporarily store up to 15 bytes between multiple add() calls
+  static const uint32_t MaxBufferSize = 15+1;
+  // internal state and temporary buffer
+  uint32_t      state[4]; // state[2] == seed if totalLength < MaxBufferSize
+  unsigned char buffer[MaxBufferSize];
+  unsigned int  bufferSize;
+  uint64_t      totalLength;
+  /// rotate bits, should compile to a single CPU instruction (ROL)
+  static inline uint32_t rotateLeft(uint32_t x, unsigned char bits)
+  {
+    return (x << bits) | (x >> (32 - bits));
+  }
+  /// load 4 bytes in native byte order; memcpy keeps unaligned addresses well-defined
+  static inline uint32_t read32(const unsigned char* p)
+  {
+    uint32_t value;
+    memcpy(&value, p, sizeof(value));
+    return value;
+  }
+  /// process a block of 4x4 bytes, this is the main part of the XXHash32 algorithm
+  static inline void process(const void* data, uint32_t& state0, uint32_t& state1, uint32_t& state2, uint32_t& state3)
+  {
+    const unsigned char* block = (const unsigned char*) data;
+    state0 = rotateLeft(state0 + read32(block     ) * Prime2, 13) * Prime1;
+    state1 = rotateLeft(state1 + read32(block +  4) * Prime2, 13) * Prime1;
+    state2 = rotateLeft(state2 + read32(block +  8) * Prime2, 13) * Prime1;
+    state3 = rotateLeft(state3 + read32(block + 12) * Prime2, 13) * Prime1;
+  }
+};
diff --git a/src/xxhash64.h b/src/xxhash64.h
new file mode 100644
index 000000000..3b1d6b371
--- /dev/null
+++ b/src/xxhash64.h
@@ -0,0 +1,189 @@
+// //////////////////////////////////////////////////////////
+// xxhash64.h
+// Copyright (c) 2016 Stephan Brumme. All rights reserved.
+// see http://create.stephan-brumme.com/disclaimer.html
+//
+// Modified from the original: multi-byte loads go through memcpy so unaligned input is safe.
+#pragma once
+#include <stdint.h> // for uint32_t and uint64_t
+#include <string.h> // for memcpy
+/// XXHash (64 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
+/** How to use:
+    uint64_t myseed = 0;
+    XXHash64 myhash(myseed);
+    myhash.add(pointerToSomeBytes,     numberOfBytes);
+    myhash.add(pointerToSomeMoreBytes, numberOfMoreBytes); // call add() as often as you like to ...
+    // and compute hash:
+    uint64_t result = myhash.hash();
+    // or all of the above in one single line:
+    uint64_t result2 = XXHash64::hash(mypointer, numBytes, myseed);
+    Note: my code is NOT endian-aware !
+**/
+class XXHash64
+{
+public:
+  /// create new XXHash (64 bit)
+  /** @param seed your seed value, even zero is a valid seed **/
+  explicit XXHash64(uint64_t seed)
+  {
+    state[0] = seed + Prime1 + Prime2;
+    state[1] = seed + Prime2;
+    state[2] = seed;
+    state[3] = seed - Prime1;
+    bufferSize  = 0;
+    totalLength = 0;
+  }
+  /// add a chunk of bytes
+  /** @param  input  pointer to a continuous block of data
+      @param  length number of bytes
+      @return false if parameters are invalid / zero **/
+  bool add(const void* input, uint64_t length)
+  {
+    // no data ?
+    if (!input || length == 0)
+      return false;
+    totalLength += length;
+    // byte-wise access
+    const unsigned char* data = (const unsigned char*)input;
+    // unprocessed old data plus new data still fit in temporary buffer ?
+    if (bufferSize + length < MaxBufferSize)
+    {
+      // just add new data
+      while (length-- > 0)
+        buffer[bufferSize++] = *data++;
+      return true;
+    }
+    // point beyond last byte
+    const unsigned char* stop = data + length;
+    // some data left from previous update ?
+    if (bufferSize > 0)
+    {
+      // make sure temporary buffer is full (32 bytes)
+      while (bufferSize < MaxBufferSize)
+        buffer[bufferSize++] = *data++;
+      // process these 32 bytes (4x8)
+      process(buffer, state[0], state[1], state[2], state[3]);
+    }
+    // copying state to local variables helps optimizer A LOT
+    uint64_t s0 = state[0], s1 = state[1], s2 = state[2], s3 = state[3];
+    // 32 bytes at once, for as long as a whole block is left
+    while ((uint64_t)(stop - data) >= MaxBufferSize)
+    {
+      // local variables s0..s3 instead of state[0]..state[3] are much faster
+      process(data, s0, s1, s2, s3);
+      data += 32;
+    }
+    // copy back
+    state[0] = s0; state[1] = s1; state[2] = s2; state[3] = s3;
+    // copy remainder to temporary buffer
+    bufferSize = (unsigned int)(stop - data);
+    for (unsigned int i = 0; i < bufferSize; i++)
+      buffer[i] = data[i];
+    // done
+    return true;
+  }
+  /// get current hash
+  /** @return 64 bit XXHash **/
+  uint64_t hash() const
+  {
+    // fold 256 bit state into one single 64 bit value
+    uint64_t result;
+    if (totalLength >= MaxBufferSize)
+    {
+      result = rotateLeft(state[0],  1) +
+               rotateLeft(state[1],  7) +
+               rotateLeft(state[2], 12) +
+               rotateLeft(state[3], 18);
+      result = (result ^ processSingle(0, state[0])) * Prime1 + Prime4;
+      result = (result ^ processSingle(0, state[1])) * Prime1 + Prime4;
+      result = (result ^ processSingle(0, state[2])) * Prime1 + Prime4;
+      result = (result ^ processSingle(0, state[3])) * Prime1 + Prime4;
+    }
+    else
+    {
+      // internal state wasn't set in add(), therefore original seed is still stored in state[2]
+      result = state[2] + Prime5;
+    }
+    result += totalLength;
+    // process remaining bytes in temporary buffer
+    const unsigned char* data = buffer;
+    // point beyond last byte
+    const unsigned char* stop = data + bufferSize;
+    // at least 8 bytes left ? => eat 8 bytes per step
+    for (; data + 8 <= stop; data += 8)
+      result = rotateLeft(result ^ processSingle(0, read64(data)), 27) * Prime1 + Prime4;
+    // 4 bytes left ? => eat those
+    if (data + 4 <= stop)
+    {
+      result = rotateLeft(result ^ read32(data) * Prime1,         23) * Prime2 + Prime3;
+      data  += 4;
+    }
+    // take care of remaining 0..3 bytes, eat 1 byte per step
+    while (data != stop)
+      result = rotateLeft(result ^ (*data++) * Prime5,            11) * Prime1;
+    // mix bits
+    result ^= result >> 33;
+    result *= Prime2;
+    result ^= result >> 29;
+    result *= Prime3;
+    result ^= result >> 32;
+    return result;
+  }
+  /// combine constructor, add() and hash() in one static function (C style)
+  /** @param  input  pointer to a continuous block of data
+      @param  length number of bytes
+      @param  seed your seed value, e.g. zero is a valid seed
+      @return 64 bit XXHash **/
+  static uint64_t hash(const void* input, uint64_t length, uint64_t seed)
+  {
+    XXHash64 hasher(seed);
+    hasher.add(input, length);
+    return hasher.hash();
+  }
+private:
+  /// magic constants :-)
+  static const uint64_t Prime1 = 11400714785074694791ULL;
+  static const uint64_t Prime2 = 14029467366897019727ULL;
+  static const uint64_t Prime3 =  1609587929392839161ULL;
+  static const uint64_t Prime4 =  9650029242287828579ULL;
+  static const uint64_t Prime5 =  2870177450012600261ULL;
+  /// temporarily store up to 31 bytes between multiple add() calls
+  static const uint64_t MaxBufferSize = 31+1;
+  uint64_t      state[4];
+  unsigned char buffer[MaxBufferSize];
+  unsigned int  bufferSize;
+  uint64_t      totalLength;
+  /// rotate bits, should compile to a single CPU instruction (ROL)
+  static inline uint64_t rotateLeft(uint64_t x, unsigned char bits)
+  {
+    return (x << bits) | (x >> (64 - bits));
+  }
+  /// load 8 bytes in native byte order; memcpy keeps unaligned addresses well-defined
+  static inline uint64_t read64(const unsigned char* p)
+  {
+    uint64_t value;
+    memcpy(&value, p, sizeof(value));
+    return value;
+  }
+  /// load 4 bytes in native byte order; memcpy keeps unaligned addresses well-defined
+  static inline uint32_t read32(const unsigned char* p)
+  {
+    uint32_t value;
+    memcpy(&value, p, sizeof(value));
+    return value;
+  }
+  /// process a single 64 bit value
+  static inline uint64_t processSingle(uint64_t previous, uint64_t input)
+  {
+    return rotateLeft(previous + input * Prime2, 31) * Prime1;
+  }
+  /// process a block of 4x8 bytes, this is the main part of the XXHash64 algorithm
+  static inline void process(const void* data, uint64_t& state0, uint64_t& state1, uint64_t& state2, uint64_t& state3)
+  {
+    const unsigned char* block = (const unsigned char*) data;
+    state0 = processSingle(state0, read64(block     ));
+    state1 = processSingle(state1, read64(block +  8));
+    state2 = processSingle(state2, read64(block + 16));
+    state3 = processSingle(state3, read64(block + 24));
+  }
+};