From d666fa170ce7211ca05c72fa3a9c3e0f55deb892 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Thu, 26 Dec 2024 15:19:14 -0500 Subject: [PATCH] perf(html5): use a hashmap to detect duplicate attributes If there are more than 16 attributes, shift from doing strcmp to using a hashmap for duplicate detection. The number 16 was chosen based on the benchmark in sparklemotion/nokogiri#2568 I've introduced @tidwall's hashmap.c (MIT licensed and the copyright appropriately copied in the LICENSE-DEPENDENCIES file) to have something self-contained within the libgumbo codebase, rather than using libxml2's xmlHash or ruby's st.c. --- LICENSE-DEPENDENCIES.md | 29 + gumbo-parser/src/Makefile | 2 + gumbo-parser/src/hashmap.c | 1154 ++++++++++++++++++++++++++++++++ gumbo-parser/src/hashmap.h | 62 ++ gumbo-parser/src/string_set.c | 53 ++ gumbo-parser/src/string_set.h | 22 + gumbo-parser/src/tokenizer.c | 68 +- gumbo-parser/test/tokenizer.cc | 36 + nokogiri.gemspec | 6 +- test/html5/test_attributes.rb | 12 + 10 files changed, 1424 insertions(+), 20 deletions(-) create mode 100644 gumbo-parser/src/hashmap.c create mode 100644 gumbo-parser/src/hashmap.h create mode 100644 gumbo-parser/src/string_set.c create mode 100644 gumbo-parser/src/string_set.h diff --git a/LICENSE-DEPENDENCIES.md b/LICENSE-DEPENDENCIES.md index 1e950b6e37e..649027f3699 100644 --- a/LICENSE-DEPENDENCIES.md +++ b/LICENSE-DEPENDENCIES.md @@ -15,6 +15,7 @@ Note that this document is broken into multiple sections, each of which describe * [Native WindowsⓇ platform releases ("x64-mingw-ucrt")](#native-windows%E2%93%A1-platform-releases-x64-mingw-ucrt) * [JavaⓇ (JRuby) platform release ("java")](#java%E2%93%A1-jruby-platform-release-java) - [Appendix: Dependencies' License Texts](#appendix-dependencies-license-texts) + * [hashmap.c](#hashmapc) * [libgumbo](#libgumbo) * [libxml2](#libxml2) * [libxslt](#libxslt) @@ -112,6 +113,34 @@ This section contains a subsection for each potentially-distributed dependency, Please 
see previous sections to understand which of these potential dependencies is actually distributed in the gem file you're downloading and using. +### hashmap.c + +MIT + +https://github.com/tidwall/hashmap.c/blob/master/LICENSE + + The MIT License (MIT) + + Copyright (c) 2020 Joshua J Baker + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ### libgumbo Apache 2.0 diff --git a/gumbo-parser/src/Makefile b/gumbo-parser/src/Makefile index 6bd4a18fbed..db58c3137f5 100644 --- a/gumbo-parser/src/Makefile +++ b/gumbo-parser/src/Makefile @@ -13,9 +13,11 @@ gumbo_objs := \ char_ref.o \ error.o \ foreign_attrs.o \ + hashmap.o \ parser.o \ string_buffer.o \ string_piece.o \ + string_set.o \ svg_attrs.o \ svg_tags.o \ tag.o \ diff --git a/gumbo-parser/src/hashmap.c b/gumbo-parser/src/hashmap.c new file mode 100644 index 00000000000..d70b16277ec --- /dev/null +++ b/gumbo-parser/src/hashmap.c @@ -0,0 +1,1154 @@ +// Copyright 2020 Joshua J Baker. All rights reserved. 
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include "hashmap.h"
+
+#define GROW_AT 0.60 /* 60% */
+#define SHRINK_AT 0.10 /* 10% */
+
+#ifndef HASHMAP_LOAD_FACTOR
+#define HASHMAP_LOAD_FACTOR GROW_AT
+#endif
+
+static void *(*__malloc)(size_t) = NULL;
+static void *(*__realloc)(void *, size_t) = NULL;
+static void (*__free)(void *) = NULL;
+
+// hashmap_set_allocator allows for configuring a custom allocator for
+// all hashmap library operations. This function, if needed, should be called
+// only once at startup and prior to calling hashmap_new().
+void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*)) {
+    __malloc = malloc;
+    __free = free;
+}
+
+struct bucket {
+    uint64_t hash:48;
+    uint64_t dib:16;
+};
+
+// hashmap is an open addressed hash map using robinhood hashing.
+struct hashmap {
+    void *(*malloc)(size_t);
+    void *(*realloc)(void *, size_t);
+    void (*free)(void *);
+    size_t elsize;
+    size_t cap;
+    uint64_t seed0;
+    uint64_t seed1;
+    uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1);
+    int (*compare)(const void *a, const void *b, void *udata);
+    void (*elfree)(void *item);
+    void *udata;
+    size_t bucketsz;
+    size_t nbuckets;
+    size_t count;
+    size_t mask;
+    size_t growat;
+    size_t shrinkat;
+    uint8_t loadfactor;
+    uint8_t growpower;
+    bool oom;
+    void *buckets;
+    void *spare;
+    void *edata;
+};
+
+void hashmap_set_grow_by_power(struct hashmap *map, size_t power) {
+    map->growpower = power < 1 ? 1 : power > 16 ? 16 : power;
+}
+
+static double clamp_load_factor(double factor, double default_factor) {
+    // Check for NaN and clamp between 50% and 95%
+    return factor != factor ? default_factor :
+           factor < 0.50 ? 0.50 :
+           factor > 0.95 ? 0.95 :
+           factor;
+}
+
+void hashmap_set_load_factor(struct hashmap *map, double factor) {
+    factor = clamp_load_factor(factor, map->loadfactor / 100.0);
+    map->loadfactor = factor * 100;
+    map->growat = map->nbuckets * (map->loadfactor / 100.0);
+}
+
+static struct bucket *bucket_at0(void *buckets, size_t bucketsz, size_t i) {
+    return (struct bucket*)(((char*)buckets)+(bucketsz*i));
+}
+
+static struct bucket *bucket_at(struct hashmap *map, size_t index) {
+    return bucket_at0(map->buckets, map->bucketsz, index);
+}
+
+static void *bucket_item(struct bucket *entry) {
+    return ((char*)entry)+sizeof(struct bucket);
+}
+
+static uint64_t clip_hash(uint64_t hash) {
+    return hash & 0xFFFFFFFFFFFF;
+}
+
+static uint64_t get_hash(struct hashmap *map, const void *key) {
+    return clip_hash(map->hash(key, map->seed0, map->seed1));
+}
+
+
+// hashmap_new_with_allocator returns a new hash map using a custom allocator.
+// See hashmap_new for more information
+struct hashmap *hashmap_new_with_allocator(void *(*_malloc)(size_t),
+    void *(*_realloc)(void*, size_t), void (*_free)(void*),
+    size_t elsize, size_t cap, uint64_t seed0, uint64_t seed1,
+    uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1),
+    int (*compare)(const void *a, const void *b, void *udata),
+    void (*elfree)(void *item),
+    void *udata)
+{
+    _malloc = _malloc ? _malloc : __malloc ? __malloc : malloc;
+    _realloc = _realloc ? _realloc : __realloc ? __realloc : realloc;
+    _free = _free ? _free : __free ? __free : free;
+    size_t ncap = 16;
+    if (cap < ncap) {
+        cap = ncap;
+    } else {
+        while (ncap < cap) {
+            ncap *= 2;
+        }
+        cap = ncap;
+    }
+    size_t bucketsz = sizeof(struct bucket) + elsize;
+    while (bucketsz & (sizeof(uintptr_t)-1)) {
+        bucketsz++;
+    }
+    // hashmap + spare + edata
+    size_t size = sizeof(struct hashmap)+bucketsz*2;
+    struct hashmap *map = _malloc(size);
+    if (!map) {
+        return NULL;
+    }
+    memset(map, 0, sizeof(struct hashmap));
+    map->elsize = elsize;
+    map->bucketsz = bucketsz;
+    map->seed0 = seed0;
+    map->seed1 = seed1;
+    map->hash = hash;
+    map->compare = compare;
+    map->elfree = elfree;
+    map->udata = udata;
+    map->spare = ((char*)map)+sizeof(struct hashmap);
+    map->edata = (char*)map->spare+bucketsz;
+    map->cap = cap;
+    map->nbuckets = cap;
+    map->mask = map->nbuckets-1;
+    map->buckets = _malloc(map->bucketsz*map->nbuckets);
+    if (!map->buckets) {
+        _free(map);
+        return NULL;
+    }
+    memset(map->buckets, 0, map->bucketsz*map->nbuckets);
+    map->growpower = 1;
+    map->loadfactor = clamp_load_factor(HASHMAP_LOAD_FACTOR, GROW_AT) * 100;
+    map->growat = map->nbuckets * (map->loadfactor / 100.0);
+    map->shrinkat = map->nbuckets * SHRINK_AT;
+    map->malloc = _malloc;
+    map->realloc = _realloc;
+    map->free = _free;
+    return map;
+}
+
+// hashmap_new returns a new hash map.
+// Param `elsize` is the size of each element in the map. Every element that
+// is inserted, deleted, or retrieved will be this size.
+// Param `cap` is the default lower capacity of the hashmap. Setting this to
+// zero will default to 16.
+// Params `seed0` and `seed1` are optional seed values that are passed to the
+// following `hash` function. These can be any value you wish but it's often
+// best to use randomly generated values.
+// Param `hash` is a function that generates a hash value for an item. It's
+// important that you provide a good hash function, otherwise it will perform
+// poorly or be vulnerable to Denial-of-service attacks. This implementation
+// comes with two helper functions `hashmap_sip()` and `hashmap_murmur()`.
+// Param `compare` is a function that compares items in the map. See the
+// qsort stdlib function for an example of how this function works.
+// The hashmap must be freed with hashmap_free().
+// Param `elfree` is a function that frees a specific item. This should be NULL
+// unless you're storing some kind of reference data in the hash.
+struct hashmap *hashmap_new(size_t elsize, size_t cap, uint64_t seed0,
+    uint64_t seed1,
+    uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1),
+    int (*compare)(const void *a, const void *b, void *udata),
+    void (*elfree)(void *item),
+    void *udata)
+{
+    return hashmap_new_with_allocator(NULL, NULL, NULL, elsize, cap, seed0,
+        seed1, hash, compare, elfree, udata);
+}
+
+static void free_elements(struct hashmap *map) {
+    if (map->elfree) {
+        for (size_t i = 0; i < map->nbuckets; i++) {
+            struct bucket *bucket = bucket_at(map, i);
+            if (bucket->dib) map->elfree(bucket_item(bucket));
+        }
+    }
+}
+
+// hashmap_clear quickly clears the map.
+// Every item is called with the element-freeing function given in hashmap_new,
+// if present, to free any data referenced in the elements of the hashmap.
+// When the update_cap is provided, the map's capacity will be updated to match
+// the current number of allocated buckets. This is an optimization to ensure
+// that this operation does not perform any allocations.
+void hashmap_clear(struct hashmap *map, bool update_cap) {
+    map->count = 0;
+    free_elements(map);
+    if (update_cap) {
+        map->cap = map->nbuckets;
+    } else if (map->nbuckets != map->cap) {
+        void *new_buckets = map->malloc(map->bucketsz*map->cap);
+        if (new_buckets) {
+            map->free(map->buckets);
+            map->buckets = new_buckets;
+        }
+        map->nbuckets = map->cap;
+    }
+    memset(map->buckets, 0, map->bucketsz*map->nbuckets);
+    map->mask = map->nbuckets-1;
+    map->growat = map->nbuckets * (map->loadfactor / 100.0) ;
+    map->shrinkat = map->nbuckets * SHRINK_AT;
+}
+
+static bool resize0(struct hashmap *map, size_t new_cap) {
+    struct hashmap *map2 = hashmap_new_with_allocator(map->malloc, map->realloc,
+        map->free, map->elsize, new_cap, map->seed0, map->seed1, map->hash,
+        map->compare, map->elfree, map->udata);
+    if (!map2) return false;
+    for (size_t i = 0; i < map->nbuckets; i++) {
+        struct bucket *entry = bucket_at(map, i);
+        if (!entry->dib) {
+            continue;
+        }
+        entry->dib = 1;
+        size_t j = entry->hash & map2->mask;
+        while(1) {
+            struct bucket *bucket = bucket_at(map2, j);
+            if (bucket->dib == 0) {
+                memcpy(bucket, entry, map->bucketsz);
+                break;
+            }
+            if (bucket->dib < entry->dib) {
+                memcpy(map2->spare, bucket, map->bucketsz);
+                memcpy(bucket, entry, map->bucketsz);
+                memcpy(entry, map2->spare, map->bucketsz);
+            }
+            j = (j + 1) & map2->mask;
+            entry->dib += 1;
+        }
+    }
+    map->free(map->buckets);
+    map->buckets = map2->buckets;
+    map->nbuckets = map2->nbuckets;
+    map->mask = map2->mask;
+    map->growat = map2->growat;
+    map->shrinkat = map2->shrinkat;
+    map->free(map2);
+    return true;
+}
+
+static bool resize(struct hashmap *map, size_t new_cap) {
+    return resize0(map, new_cap);
+}
+
+// hashmap_set_with_hash works like hashmap_set but you provide your
+// own hash. The 'hash' callback provided to the hashmap_new function
+// will not be called
+const void *hashmap_set_with_hash(struct hashmap *map, const void *item,
+    uint64_t hash)
+{
+    hash = clip_hash(hash);
+    map->oom = false;
+    if (map->count >= map->growat) {
+        if (!resize(map, map->nbuckets*(1<<map->growpower))) {
+            map->oom = true;
+            return NULL;
+        }
+    }
+
+    struct bucket *entry = map->edata;
+    entry->hash = hash;
+    entry->dib = 1;
+    void *eitem = bucket_item(entry);
+    memcpy(eitem, item, map->elsize);
+
+    void *bitem;
+    size_t i = entry->hash & map->mask;
+    while(1) {
+        struct bucket *bucket = bucket_at(map, i);
+        if (bucket->dib == 0) {
+            memcpy(bucket, entry, map->bucketsz);
+            map->count++;
+            return NULL;
+        }
+        bitem = bucket_item(bucket);
+        if (entry->hash == bucket->hash && (!map->compare ||
+            map->compare(eitem, bitem, map->udata) == 0))
+        {
+            memcpy(map->spare, bitem, map->elsize);
+            memcpy(bitem, eitem, map->elsize);
+            return map->spare;
+        }
+        if (bucket->dib < entry->dib) {
+            memcpy(map->spare, bucket, map->bucketsz);
+            memcpy(bucket, entry, map->bucketsz);
+            memcpy(entry, map->spare, map->bucketsz);
+            eitem = bucket_item(entry);
+        }
+        i = (i + 1) & map->mask;
+        entry->dib += 1;
+    }
+}
+
+// hashmap_set inserts or replaces an item in the hash map. If an item is
+// replaced then it is returned otherwise NULL is returned. This operation
+// may allocate memory. If the system is unable to allocate additional
+// memory then NULL is returned and hashmap_oom() returns true.
+const void *hashmap_set(struct hashmap *map, const void *item) {
+    return hashmap_set_with_hash(map, item, get_hash(map, item));
+}
+
+// hashmap_get_with_hash works like hashmap_get but you provide your
+// own hash. The 'hash' callback provided to the hashmap_new function
+// will not be called
+const void *hashmap_get_with_hash(struct hashmap *map, const void *key,
+    uint64_t hash)
+{
+    hash = clip_hash(hash);
+    size_t i = hash & map->mask;
+    while(1) {
+        struct bucket *bucket = bucket_at(map, i);
+        if (!bucket->dib) return NULL;
+        if (bucket->hash == hash) {
+            void *bitem = bucket_item(bucket);
+            if (!map->compare || map->compare(key, bitem, map->udata) == 0) {
+                return bitem;
+            }
+        }
+        i = (i + 1) & map->mask;
+    }
+}
+
+// hashmap_get returns the item based on the provided key. If the item is not
+// found then NULL is returned.
+const void *hashmap_get(struct hashmap *map, const void *key) {
+    return hashmap_get_with_hash(map, key, get_hash(map, key));
+}
+
+// hashmap_probe returns the item in the bucket at position or NULL if an item
+// is not set for that bucket. The position is 'moduloed' by the number of
+// buckets in the hashmap.
+const void *hashmap_probe(struct hashmap *map, uint64_t position) {
+    size_t i = position & map->mask;
+    struct bucket *bucket = bucket_at(map, i);
+    if (!bucket->dib) {
+        return NULL;
+    }
+    return bucket_item(bucket);
+}
+
+// hashmap_delete_with_hash works like hashmap_delete but you provide your
+// own hash. The 'hash' callback provided to the hashmap_new function
+// will not be called
+const void *hashmap_delete_with_hash(struct hashmap *map, const void *key,
+    uint64_t hash)
+{
+    hash = clip_hash(hash);
+    map->oom = false;
+    size_t i = hash & map->mask;
+    while(1) {
+        struct bucket *bucket = bucket_at(map, i);
+        if (!bucket->dib) {
+            return NULL;
+        }
+        void *bitem = bucket_item(bucket);
+        if (bucket->hash == hash && (!map->compare ||
+            map->compare(key, bitem, map->udata) == 0))
+        {
+            memcpy(map->spare, bitem, map->elsize);
+            bucket->dib = 0;
+            while(1) {
+                struct bucket *prev = bucket;
+                i = (i + 1) & map->mask;
+                bucket = bucket_at(map, i);
+                if (bucket->dib <= 1) {
+                    prev->dib = 0;
+                    break;
+                }
+                memcpy(prev, bucket, map->bucketsz);
+                prev->dib--;
+            }
+            map->count--;
+            if (map->nbuckets > map->cap && map->count <= map->shrinkat) {
+                // Ignore the return value. It's ok for the resize operation to
+                // fail to allocate enough memory because a shrink operation
+                // does not change the integrity of the data.
+                resize(map, map->nbuckets/2);
+            }
+            return map->spare;
+        }
+        i = (i + 1) & map->mask;
+    }
+}
+
+// hashmap_delete removes an item from the hash map and returns it. If the
+// item is not found then NULL is returned.
+const void *hashmap_delete(struct hashmap *map, const void *key) {
+    return hashmap_delete_with_hash(map, key, get_hash(map, key));
+}
+
+// hashmap_count returns the number of items in the hash map.
+size_t hashmap_count(struct hashmap *map) {
+    return map->count;
+}
+
+// hashmap_free frees the hash map
+// Every item is called with the element-freeing function given in hashmap_new,
+// if present, to free any data referenced in the elements of the hashmap.
+void hashmap_free(struct hashmap *map) {
+    if (!map) return;
+    free_elements(map);
+    map->free(map->buckets);
+    map->free(map);
+}
+
+// hashmap_oom returns true if the last hashmap_set() call failed due to the
+// system being out of memory.
+bool hashmap_oom(struct hashmap *map) {
+    return map->oom;
+}
+
+// hashmap_scan iterates over all items in the hash map
+// Param `iter` can return false to stop iteration early.
+// Returns false if the iteration has been stopped early.
+bool hashmap_scan(struct hashmap *map,
+    bool (*iter)(const void *item, void *udata), void *udata)
+{
+    for (size_t i = 0; i < map->nbuckets; i++) {
+        struct bucket *bucket = bucket_at(map, i);
+        if (bucket->dib && !iter(bucket_item(bucket), udata)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// hashmap_iter iterates one key at a time yielding a reference to an
+// entry at each iteration. Useful to write simple loops and avoid writing
+// dedicated callbacks and udata structures, as in hashmap_scan.
+//
+// map is a hash map handle. i is a pointer to a size_t cursor that
+// should be initialized to 0 at the beginning of the loop. item is a void
+// pointer pointer that is populated with the retrieved item. Note that this
+// is NOT a copy of the item stored in the hash map and can be directly
+// modified.
+//
+// Note that if hashmap_delete() is called on the hashmap being iterated,
+// the buckets are rearranged and the iterator must be reset to 0, otherwise
+// unexpected results may be returned after deletion.
+//
+// This function has not been tested for thread safety.
+//
+// The function returns true if an item was retrieved; false if the end of the
+// iteration has been reached.
+bool hashmap_iter(struct hashmap *map, size_t *i, void **item) {
+    struct bucket *bucket;
+    do {
+        if (*i >= map->nbuckets) return false;
+        bucket = bucket_at(map, *i);
+        (*i)++;
+    } while (!bucket->dib);
+    *item = bucket_item(bucket);
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// SipHash reference C implementation
+//
+// Copyright (c) 2012-2016 Jean-Philippe Aumasson
+//
+// Copyright (c) 2012-2014 Daniel J. Bernstein
+//
+// To the extent possible under law, the author(s) have dedicated all copyright
+// and related and neighboring rights to this software to the public domain
+// worldwide. This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication along
+// with this software. If not, see
+// <http://creativecommons.org/publicdomain/zero/1.0/>.
+//
+// default: SipHash-2-4
+//-----------------------------------------------------------------------------
+static uint64_t SIP64(const uint8_t *in, const size_t inlen, uint64_t seed0,
+    uint64_t seed1)
+{
+#define U8TO64_LE(p) \
+    {  (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \
+        ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \
+        ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \
+        ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) }
+#define U64TO8_LE(p, v) \
+    { U32TO8_LE((p), (uint32_t)((v))); \
+      U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); }
+#define U32TO8_LE(p, v) \
+    { (p)[0] = (uint8_t)((v)); \
+      (p)[1] = (uint8_t)((v) >> 8); \
+      (p)[2] = (uint8_t)((v) >> 16); \
+      (p)[3] = (uint8_t)((v) >> 24); }
+#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
+#define SIPROUND \
+    { v0 += v1; v1 = ROTL(v1, 13); \
+      v1 ^= v0; v0 = ROTL(v0, 32); \
+      v2 += v3; v3 = ROTL(v3, 16); \
+      v3 ^= v2; \
+      v0 += v3; v3 = ROTL(v3, 21); \
+      v3 ^= v0; \
+      v2 += v1; v1 = ROTL(v1, 17); \
+      v1 ^= v2; v2 = ROTL(v2, 32); }
+    uint64_t k0 = U8TO64_LE((uint8_t*)&seed0);
+    uint64_t k1 = U8TO64_LE((uint8_t*)&seed1);
+    uint64_t v3 = UINT64_C(0x7465646279746573) ^ k1;
+    uint64_t v2 = UINT64_C(0x6c7967656e657261) ^ k0;
+    uint64_t v1 = UINT64_C(0x646f72616e646f6d) ^ k1;
+    uint64_t v0 = UINT64_C(0x736f6d6570736575) ^ k0;
+    const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
+    for (; in != end; in += 8) {
+        uint64_t m = U8TO64_LE(in);
+        v3 ^= m;
+        SIPROUND; SIPROUND;
+        v0 ^= m;
+    }
+    const int left = inlen & 7;
+    uint64_t b = ((uint64_t)inlen) << 56;
+    switch (left) {
+    case 7: b |= ((uint64_t)in[6]) << 48; /* fall through */
+    case 6: b |= ((uint64_t)in[5]) << 40; /* fall through */
+    case 5: b |= ((uint64_t)in[4]) << 32; /* fall through */
+    case 4: b |= ((uint64_t)in[3]) << 24; /* fall through */
+    case 3: b |= ((uint64_t)in[2]) << 16; /* fall through */
+    case 2: b |= ((uint64_t)in[1]) << 8; /* fall through */
+    case 1: b |= ((uint64_t)in[0]); break;
+    case 0: break;
+    }
+    v3 ^= b;
+    SIPROUND; SIPROUND;
+    v0 ^= b;
+    v2 ^= 0xff;
+    SIPROUND; SIPROUND; SIPROUND; SIPROUND;
+    b = v0 ^ v1 ^ v2 ^ v3;
+    uint64_t out = 0;
+    U64TO8_LE((uint8_t*)&out, b);
+    return out;
+}
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+//
+// Murmur3_86_128
+//-----------------------------------------------------------------------------
+static uint64_t MM86128(const void *key, const int len, uint32_t seed) {
+#define ROTL32(x, r) ((x << r) | (x >> (32 - r)))
+#define FMIX32(h) h^=h>>16; h*=0x85ebca6b; h^=h>>13; h*=0xc2b2ae35; h^=h>>16;
+    const uint8_t * data = (const uint8_t*)key;
+    const int nblocks = len / 16;
+    uint32_t h1 = seed;
+    uint32_t h2 = seed;
+    uint32_t h3 = seed;
+    uint32_t h4 = seed;
+    uint32_t c1 = 0x239b961b;
+    uint32_t c2 = 0xab0e9789;
+    uint32_t c3 = 0x38b34ae5;
+    uint32_t c4 = 0xa1e38b93;
+    const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+    for (int i = -nblocks; i; i++) {
+        uint32_t k1 = blocks[i*4+0];
+        uint32_t k2 = blocks[i*4+1];
+        uint32_t k3 = blocks[i*4+2];
+        uint32_t k4 = blocks[i*4+3];
+        k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+        h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+        k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+        h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+        k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+        h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+        k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+        h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+    }
+    const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+    uint32_t k1 = 0;
+    uint32_t k2 = 0;
+    uint32_t k3 = 0;
+    uint32_t k4 = 0;
+    switch(len & 15) {
+    case 15: k4 ^= tail[14] << 16; /* fall through */
+    case 14: k4 ^= tail[13] << 8; /* fall through */
+    case 13: k4 ^= tail[12] << 0;
+             k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+             /* fall through */
+    case 12: k3 ^= (uint32_t)tail[11] << 24; /* fall through */
+    case 11: k3 ^= tail[10] << 16; /* fall through */
+    case 10: k3 ^= tail[ 9] << 8; /* fall through */
+    case 9: k3 ^= tail[ 8] << 0;
+             k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+             /* fall through */
+    case 8: k2 ^= (uint32_t)tail[ 7] << 24; /* fall through */
+    case 7: k2 ^= tail[ 6] << 16; /* fall through */
+    case 6: k2 ^= tail[ 5] << 8; /* fall through */
+    case 5: k2 ^= tail[ 4] << 0;
+             k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+             /* fall through */
+    case 4: k1 ^= (uint32_t)tail[ 3] << 24; /* fall through */
+    case 3: k1 ^= tail[ 2] << 16; /* fall through */
+    case 2: k1 ^= tail[ 1] << 8; /* fall through */
+    case 1: k1 ^= tail[ 0] << 0;
+             k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+             /* fall through */
+    };
+    h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+    h1 += h2; h1 += h3; h1 += h4;
+    h2 += h1; h3 += h1; h4 += h1;
+    FMIX32(h1); FMIX32(h2); FMIX32(h3); FMIX32(h4);
+    h1 += h2; h1 += h3; h1 += h4;
+    h2 += h1; h3 += h1; h4 += h1;
+    return (((uint64_t)h2)<<32)|h1;
+}
+
+//-----------------------------------------------------------------------------
+// xxHash Library
+// Copyright (c) 2012-2021 Yann Collet
+// All rights reserved.
+//
+// BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+//
+// xxHash3
+//-----------------------------------------------------------------------------
+#define XXH_PRIME_1 11400714785074694791ULL
+#define XXH_PRIME_2 14029467366897019727ULL
+#define XXH_PRIME_3 1609587929392839161ULL
+#define XXH_PRIME_4 9650029242287828579ULL
+#define XXH_PRIME_5 2870177450012600261ULL
+
+static uint64_t XXH_read64(const void* memptr) {
+    uint64_t val;
+    memcpy(&val, memptr, sizeof(val));
+    return val;
+}
+
+static uint32_t XXH_read32(const void* memptr) {
+    uint32_t val;
+    memcpy(&val, memptr, sizeof(val));
+    return val;
+}
+
+static uint64_t XXH_rotl64(uint64_t x, int r) {
+    return (x << r) | (x >> (64 - r));
+}
+
+static uint64_t xxh3(const void* data, size_t len, uint64_t seed) {
+    const uint8_t* p = (const uint8_t*)data;
+    const uint8_t* const end = p + len;
+    uint64_t h64;
+
+    if (len >= 32) {
+        const uint8_t* const limit = end - 32;
+        uint64_t v1 = seed + XXH_PRIME_1 + XXH_PRIME_2;
+        uint64_t v2 = seed + XXH_PRIME_2;
+        uint64_t v3 = seed + 0;
+        uint64_t v4 = seed - XXH_PRIME_1;
+
+        do {
+            v1 += XXH_read64(p) * XXH_PRIME_2;
+            v1 = XXH_rotl64(v1, 31);
+            v1 *= XXH_PRIME_1;
+
+            v2 += XXH_read64(p + 8) * XXH_PRIME_2;
+            v2 = XXH_rotl64(v2, 31);
+            v2 *= XXH_PRIME_1;
+
+            v3 += XXH_read64(p + 16) * XXH_PRIME_2;
+            v3 = XXH_rotl64(v3, 31);
+            v3 *= XXH_PRIME_1;
+
+            v4 += XXH_read64(p + 24) * XXH_PRIME_2;
+            v4 = XXH_rotl64(v4, 31);
+            v4 *= XXH_PRIME_1;
+
+            p += 32;
+        } while (p <= limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
+            XXH_rotl64(v4, 18);
+
+        v1 *= XXH_PRIME_2;
+        v1 = XXH_rotl64(v1, 31);
+        v1 *= XXH_PRIME_1;
+        h64 ^= v1;
+        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;
+
+        v2 *= XXH_PRIME_2;
+        v2 = XXH_rotl64(v2, 31);
+        v2 *= XXH_PRIME_1;
+        h64 ^= v2;
+        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;
+
+        v3 *= XXH_PRIME_2;
+        v3 = XXH_rotl64(v3, 31);
+        v3 *= XXH_PRIME_1;
+        h64 ^= v3;
+        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;
+
+        v4 *= XXH_PRIME_2;
+        v4 = XXH_rotl64(v4, 31);
+        v4 *= XXH_PRIME_1;
+        h64 ^= v4;
+        h64 = h64 * XXH_PRIME_1 + XXH_PRIME_4;
+    }
+    else {
+        h64 = seed + XXH_PRIME_5;
+    }
+
+    h64 += (uint64_t)len;
+
+    while (p + 8 <= end) {
+        uint64_t k1 = XXH_read64(p);
+        k1 *= XXH_PRIME_2;
+        k1 = XXH_rotl64(k1, 31);
+        k1 *= XXH_PRIME_1;
+        h64 ^= k1;
+        h64 = XXH_rotl64(h64, 27) * XXH_PRIME_1 + XXH_PRIME_4;
+        p += 8;
+    }
+
+    if (p + 4 <= end) {
+        h64 ^= (uint64_t)(XXH_read32(p)) * XXH_PRIME_1;
+        h64 = XXH_rotl64(h64, 23) * XXH_PRIME_2 + XXH_PRIME_3;
+        p += 4;
+    }
+
+    while (p < end) {
+        h64 ^= (*p) * XXH_PRIME_5;
+        h64 = XXH_rotl64(h64, 11) * XXH_PRIME_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= XXH_PRIME_2;
+    h64 ^= h64 >> 29;
+    h64 *= XXH_PRIME_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+// hashmap_sip returns a hash value for `data` using SipHash-2-4.
+uint64_t hashmap_sip(const void *data, size_t len, uint64_t seed0,
+    uint64_t seed1)
+{
+    return SIP64((uint8_t*)data, len, seed0, seed1);
+}
+
+// hashmap_murmur returns a hash value for `data` using Murmur3_86_128.
+uint64_t hashmap_murmur(const void *data, size_t len, uint64_t seed0,
+    uint64_t seed1)
+{
+    (void)seed1;
+    return MM86128(data, len, seed0);
+}
+
+uint64_t hashmap_xxhash3(const void *data, size_t len, uint64_t seed0,
+    uint64_t seed1)
+{
+    (void)seed1;
+    return xxh3(data, len ,seed0);
+}
+
+//==============================================================================
+// TESTS AND BENCHMARKS
+// $ cc -DHASHMAP_TEST hashmap.c && ./a.out # run tests
+// $ cc -DHASHMAP_TEST -O3 hashmap.c && BENCH=1 ./a.out # run benchmarks
+//==============================================================================
+#ifdef HASHMAP_TEST
+
+static size_t deepcount(struct hashmap *map) {
+    size_t count = 0;
+    for (size_t i = 0; i < map->nbuckets; i++) {
+        if (bucket_at(map, i)->dib) {
+            count++;
+        }
+    }
+    return count;
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+#ifdef __clang__
+#pragma GCC diagnostic ignored "-Wunknown-warning-option"
+#pragma GCC diagnostic ignored "-Wcompound-token-split-by-macro"
+#pragma GCC diagnostic ignored "-Wgnu-statement-expression-from-macro-expansion"
+#endif
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include "hashmap.h"
+
+static bool rand_alloc_fail = false;
+static int rand_alloc_fail_odds = 3; // 1 in 3 chance malloc will fail.
+static uintptr_t total_allocs = 0;
+static uintptr_t total_mem = 0;
+
+static void *xmalloc(size_t size) {
+    if (rand_alloc_fail && rand()%rand_alloc_fail_odds == 0) {
+        return NULL;
+    }
+    void *mem = malloc(sizeof(uintptr_t)+size);
+    assert(mem);
+    *(uintptr_t*)mem = size;
+    total_allocs++;
+    total_mem += size;
+    return (char*)mem+sizeof(uintptr_t);
+}
+
+static void xfree(void *ptr) {
+    if (ptr) {
+        total_mem -= *(uintptr_t*)((char*)ptr-sizeof(uintptr_t));
+        free((char*)ptr-sizeof(uintptr_t));
+        total_allocs--;
+    }
+}
+
+static void shuffle(void *array, size_t numels, size_t elsize) {
+    char tmp[elsize];
+    char *arr = array;
+    for (size_t i = 0; i + 1 < numels; i++) { // `i + 1` form: no size_t wrap when numels == 0
+        size_t j = i + rand() / (RAND_MAX / (numels - i) + 1); // index stays in size_t, no narrowing
+        memcpy(tmp, arr + j * elsize, elsize);
+        memcpy(arr + j * elsize, arr + i * elsize, elsize);
+        memcpy(arr + i * elsize, tmp, elsize);
+    }
+}
+
+static bool iter_ints(const void *item, void *udata) {
+    int *vals = *(int**)udata;
+    vals[*(int*)item] = 1;
+    return true;
+}
+
+static int compare_ints_udata(const void *a, const void *b, void *udata) {
+    return *(int*)a - *(int*)b;
+}
+
+static int compare_strs(const void *a, const void *b, void *udata) {
+    return strcmp(*(char**)a, *(char**)b);
+}
+
+static uint64_t hash_int(const void *item, uint64_t seed0, uint64_t seed1) {
+    return hashmap_xxhash3(item, sizeof(int), seed0, seed1);
+    // return hashmap_sip(item, sizeof(int), seed0, seed1);
+    // return hashmap_murmur(item, sizeof(int), seed0, seed1);
+}
+
+static uint64_t hash_str(const void *item, uint64_t seed0, uint64_t seed1) {
+    return hashmap_xxhash3(*(char**)item, strlen(*(char**)item), seed0, seed1);
+    // return hashmap_sip(*(char**)item, strlen(*(char**)item), seed0, seed1);
+    // return hashmap_murmur(*(char**)item, strlen(*(char**)item), seed0, seed1);
+}
+
+static void free_str(void *item) {
+    xfree(*(char**)item);
+}
+
+static void all(void) {
+    int seed = getenv("SEED")?atoi(getenv("SEED")):time(NULL);
+    int N = getenv("N")?atoi(getenv("N")):2000;
+    printf("seed=%d, count=%d, item_size=%zu\n", seed, N, sizeof(int));
+    srand(seed);
+
+    rand_alloc_fail = true;
+
+    // test sip, murmur and xxhash3 hashes
+    assert(hashmap_sip("hello", 5, 1, 2) == 2957200328589801622);
+    assert(hashmap_murmur("hello", 5, 1, 2) == 1682575153221130884);
+    assert(hashmap_xxhash3("hello", 5, 1, 2) == 2584346877953614258);
+
+    int *vals;
+    while (!(vals = xmalloc(N * sizeof(int)))) {}
+    for (int i = 0; i < N; i++) {
+        vals[i] = i;
+    }
+
+    struct hashmap *map;
+
+    while (!(map = hashmap_new(sizeof(int), 0, seed, seed,
+                               hash_int, compare_ints_udata, NULL, NULL))) {}
+    shuffle(vals, N, sizeof(int));
+    for (int i = 0; i < N; i++) {
+        // // printf("== %d ==\n", vals[i]);
+        assert(map->count == (size_t)i);
+        assert(map->count == hashmap_count(map));
+        assert(map->count == deepcount(map));
+        const int *v;
+        assert(!hashmap_get(map, &vals[i]));
+        assert(!hashmap_delete(map, &vals[i]));
+        while (true) {
+            assert(!hashmap_set(map, &vals[i]));
+            if (!hashmap_oom(map)) {
+                break;
+            }
+        }
+
+        for (int j = 0; j < i; j++) {
+            v = hashmap_get(map, &vals[j]);
+            assert(v && *v == vals[j]);
+        }
+        while (true) {
+            v = hashmap_set(map, &vals[i]);
+            if (!v) {
+                assert(hashmap_oom(map));
+                continue;
+            } else {
+                assert(!hashmap_oom(map));
+                assert(v && *v == vals[i]);
+                break;
+            }
+        }
+        v = hashmap_get(map, &vals[i]);
+        assert(v && *v == vals[i]);
+        v = hashmap_delete(map, &vals[i]);
+        assert(v && *v == vals[i]);
+        assert(!hashmap_get(map, &vals[i]));
+        assert(!hashmap_delete(map, &vals[i]));
+        assert(!hashmap_set(map, &vals[i]));
+        assert(map->count == (size_t)(i+1));
+        assert(map->count == hashmap_count(map));
+        assert(map->count == deepcount(map));
+    }
+
+    int *vals2;
+    while (!(vals2 = xmalloc(N * sizeof(int)))) {}
+    memset(vals2, 0, N * sizeof(int));
+    assert(hashmap_scan(map, iter_ints, &vals2));
+
+    // Test hashmap_iter. This does the same as hashmap_scan above.
+    size_t iter = 0;
+    void *iter_val;
+    while (hashmap_iter (map, &iter, &iter_val)) {
+        assert (iter_ints(iter_val, &vals2));
+    }
+    for (int i = 0; i < N; i++) {
+        assert(vals2[i] == 1);
+    }
+    xfree(vals2);
+
+    shuffle(vals, N, sizeof(int));
+    for (int i = 0; i < N; i++) {
+        const int *v;
+        v = hashmap_delete(map, &vals[i]);
+        assert(v && *v == vals[i]);
+        assert(!hashmap_get(map, &vals[i]));
+        assert(map->count == (size_t)(N-i-1));
+        assert(map->count == hashmap_count(map));
+        assert(map->count == deepcount(map));
+        for (int j = N-1; j > i; j--) {
+            v = hashmap_get(map, &vals[j]);
+            assert(v && *v == vals[j]);
+        }
+    }
+
+    for (int i = 0; i < N; i++) {
+        while (true) {
+            assert(!hashmap_set(map, &vals[i]));
+            if (!hashmap_oom(map)) {
+                break;
+            }
+        }
+    }
+
+    assert(map->count != 0);
+    size_t prev_cap = map->cap;
+    hashmap_clear(map, true);
+    assert(prev_cap < map->cap);
+    assert(map->count == 0);
+
+
+    for (int i = 0; i < N; i++) {
+        while (true) {
+            assert(!hashmap_set(map, &vals[i]));
+            if (!hashmap_oom(map)) {
+                break;
+            }
+        }
+    }
+
+    prev_cap = map->cap;
+    hashmap_clear(map, false);
+    assert(prev_cap == map->cap);
+
+    hashmap_free(map);
+
+    xfree(vals);
+
+
+    while (!(map = hashmap_new(sizeof(char*), 0, seed, seed,
+                               hash_str, compare_strs, free_str, NULL)));
+
+    for (int i = 0; i < N; i++) {
+        char *str;
+        while (!(str = xmalloc(16)));
+        snprintf(str, 16, "s%i", i);
+        while(!hashmap_set(map, &str));
+    }
+
+    hashmap_clear(map, false);
+    assert(hashmap_count(map) == 0);
+
+    for (int i = 0; i < N; i++) {
+        char *str;
+        while (!(str = xmalloc(16)));
+        snprintf(str, 16, "s%i", i);
+        while(!hashmap_set(map, &str));
+    }
+
+    hashmap_free(map);
+
+    if (total_allocs != 0) {
+        fprintf(stderr, "total_allocs: expected 0, got %lu\n", (unsigned long)total_allocs); // cast: uintptr_t need not be unsigned long
+        exit(1);
+    }
+}
+
+#define bench(name, N, code) {{ \
+    if (strlen(name) > 0) { \
+        printf("%-14s ", name); \
+    } \
+    size_t tmem = total_mem; \
+    size_t tallocs = total_allocs; \
+    uint64_t bytes = 0; \
+    clock_t begin = 
clock(); \
+    for (int i = 0; i < N; i++) { \
+        (code); \
+    } \
+    clock_t end = clock(); \
+    double elapsed_secs = (double)(end - begin) / CLOCKS_PER_SEC; \
+    double bytes_sec = (double)bytes/elapsed_secs; \
+    printf("%d ops in %.3f secs, %.0f ns/op, %.0f op/sec", \
+        N, elapsed_secs, \
+        elapsed_secs/(double)N*1e9, \
+        (double)N/elapsed_secs \
+    ); \
+    if (bytes > 0) { \
+        printf(", %.1f GB/sec", bytes_sec/1024/1024/1024); \
+    } \
+    if (total_mem > tmem) { \
+        size_t used_mem = total_mem-tmem; \
+        printf(", %.2f bytes/op", (double)used_mem/N); \
+    } \
+    if (total_allocs > tallocs) { \
+        size_t used_allocs = total_allocs-tallocs; \
+        printf(", %.2f allocs/op", (double)used_allocs/N); \
+    } \
+    printf("\n"); \
+}}
+
+static void benchmarks(void) {
+    int seed = getenv("SEED")?atoi(getenv("SEED")):time(NULL);
+    int N = getenv("N")?atoi(getenv("N")):5000000;
+    printf("seed=%d, count=%d, item_size=%zu\n", seed, N, sizeof(int));
+    srand(seed);
+
+
+    int *vals = xmalloc(N * sizeof(int));
+    for (int i = 0; i < N; i++) {
+        vals[i] = i;
+    }
+
+    shuffle(vals, N, sizeof(int));
+
+    struct hashmap *map;
+    shuffle(vals, N, sizeof(int));
+
+    map = hashmap_new(sizeof(int), 0, seed, seed, hash_int, compare_ints_udata,
+                      NULL, NULL);
+    bench("set", N, {
+        const int *v = hashmap_set(map, &vals[i]);
+        assert(!v);
+    })
+    shuffle(vals, N, sizeof(int));
+    bench("get", N, {
+        const int *v = hashmap_get(map, &vals[i]);
+        assert(v && *v == vals[i]);
+    })
+    shuffle(vals, N, sizeof(int));
+    bench("delete", N, {
+        const int *v = hashmap_delete(map, &vals[i]);
+        assert(v && *v == vals[i]);
+    })
+    hashmap_free(map);
+
+    map = hashmap_new(sizeof(int), N, seed, seed, hash_int, compare_ints_udata,
+                      NULL, NULL);
+    bench("set (cap)", N, {
+        const int *v = hashmap_set(map, &vals[i]);
+        assert(!v);
+    })
+    shuffle(vals, N, sizeof(int));
+    bench("get (cap)", N, {
+        const int *v = hashmap_get(map, &vals[i]);
+        assert(v && *v == vals[i]);
+    })
+    shuffle(vals, N, sizeof(int));
+    bench("delete (cap)" , N, {
+        const int *v = hashmap_delete(map, &vals[i]);
+        assert(v && *v == vals[i]);
+    })
+
+    hashmap_free(map);
+
+
+    xfree(vals);
+
+    if (total_allocs != 0) {
+        fprintf(stderr, "total_allocs: expected 0, got %lu\n", (unsigned long)total_allocs); // cast: uintptr_t need not be unsigned long
+        exit(1);
+    }
+}
+
+int main(void) {
+    hashmap_set_allocator(xmalloc, xfree);
+
+    if (getenv("BENCH")) {
+        printf("Running hashmap.c benchmarks...\n");
+        benchmarks();
+    } else {
+        printf("Running hashmap.c tests...\n");
+        all();
+        printf("PASSED\n");
+    }
+}
+
+
+#endif
+
+
+
diff --git a/gumbo-parser/src/hashmap.h b/gumbo-parser/src/hashmap.h
new file mode 100644
index 00000000000..e22990e0453
--- /dev/null
+++ b/gumbo-parser/src/hashmap.h
@@ -0,0 +1,62 @@
+// Copyright 2020 Joshua J Baker. All rights reserved.
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file.
+
+#ifndef HASHMAP_H
+#define HASHMAP_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // __cplusplus
+
+struct hashmap;
+
+struct hashmap *hashmap_new(size_t elsize, size_t cap, uint64_t seed0,
+    uint64_t seed1,
+    uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1),
+    int (*compare)(const void *a, const void *b, void *udata),
+    void (*elfree)(void *item),
+    void *udata);
+
+struct hashmap *hashmap_new_with_allocator(void *(*malloc)(size_t),
+    void *(*realloc)(void *, size_t), void (*free)(void*), size_t elsize,
+    size_t cap, uint64_t seed0, uint64_t seed1,
+    uint64_t (*hash)(const void *item, uint64_t seed0, uint64_t seed1),
+    int (*compare)(const void *a, const void *b, void *udata),
+    void (*elfree)(void *item),
+    void *udata);
+
+void hashmap_free(struct hashmap *map);
+void hashmap_clear(struct hashmap *map, bool update_cap);
+size_t hashmap_count(struct hashmap *map);
+bool hashmap_oom(struct hashmap *map);
+const void *hashmap_get(struct hashmap *map, const void *item);
+const void *hashmap_set(struct hashmap *map, const void *item);
+const void *hashmap_delete(struct hashmap 
*map, const void *item);
+const void *hashmap_probe(struct hashmap *map, uint64_t position);
+bool hashmap_scan(struct hashmap *map, bool (*iter)(const void *item, void *udata), void *udata);
+bool hashmap_iter(struct hashmap *map, size_t *i, void **item);
+
+uint64_t hashmap_sip(const void *data, size_t len, uint64_t seed0, uint64_t seed1);
+uint64_t hashmap_murmur(const void *data, size_t len, uint64_t seed0, uint64_t seed1);
+uint64_t hashmap_xxhash3(const void *data, size_t len, uint64_t seed0, uint64_t seed1);
+
+const void *hashmap_get_with_hash(struct hashmap *map, const void *key, uint64_t hash);
+const void *hashmap_delete_with_hash(struct hashmap *map, const void *key, uint64_t hash);
+const void *hashmap_set_with_hash(struct hashmap *map, const void *item, uint64_t hash);
+void hashmap_set_grow_by_power(struct hashmap *map, size_t power);
+void hashmap_set_load_factor(struct hashmap *map, double load_factor);
+
+
+// DEPRECATED: use `hashmap_new_with_allocator`
+void hashmap_set_allocator(void *(*malloc)(size_t), void (*free)(void*));
+
+#if defined(__cplusplus)
+}
+#endif // __cplusplus
+
+#endif // HASHMAP_H
diff --git a/gumbo-parser/src/string_set.c b/gumbo-parser/src/string_set.c
new file mode 100644
index 00000000000..389b1cf73ed
--- /dev/null
+++ b/gumbo-parser/src/string_set.c
@@ -0,0 +1,53 @@
+#include "string_set.h"
+
+#include <string.h>
+#include "hashmap.h"
+
+#define SEED0 0xf00ba2
+#define SEED1 0xfa1afe1
+
+// Elements are `const char *` slots (elsize == sizeof(char *)), so a and b point at pointers.
+static int string_compare(const void *a, const void *b, void *udata) {
+  return strcmp(*(const char *const *)a, *(const char *const *)b);
+}
+
+// Hashes the NUL-terminated string that the stored pointer refers to.
+static uint64_t string_hash(const void *item, uint64_t seed0, uint64_t seed1) {
+  const char *str = *(const char *const *)item;
+  return hashmap_xxhash3(str, strlen(str), seed0, seed1);
+}
+
+// Same hash as string_hash(), for callers that already know the string's length.
+static uint64_t string_hash2(const char* str, size_t len, uint64_t seed0, uint64_t seed1) {
+  return hashmap_xxhash3(str, len, seed0, seed1);
+}
+
+// Creates an empty set; `cap` is forwarded to hashmap_new(). May return NULL if that fails.
+GumboStringSet *gumbo_string_set_new(size_t cap)
+{
+  return (GumboStringSet*)hashmap_new(sizeof(char *), cap, SEED0, SEED1, string_hash, string_compare, NULL, NULL);
+}
+
+void gumbo_string_set_free(GumboStringSet *set)
+{
+  hashmap_free(set);
+}
+
+// Stores the pointer `str` itself (no copy, no elfree): `str` must outlive the set.
+void gumbo_string_set_insert(GumboStringSet *set, const char *str)
+{
+  hashmap_set(set, &str);
+}
+
+// Returns 1 if a string equal to the NUL-terminated `str` is in the set, else 0.
+int gumbo_string_set_contains(GumboStringSet *set, const char *str)
+{
+  return hashmap_get(set, &str) == NULL ? 0 : 1;
+}
+
+// As gumbo_string_set_contains(); `len` must equal strlen(str) so the hashes agree.
+int gumbo_string_set_contains2(GumboStringSet *set, const char *str, size_t len)
+{
+  uint64_t hash = string_hash2(str, len, SEED0, SEED1);
+  return hashmap_get_with_hash(set, &str, hash) == NULL ? 0 : 1;
+}
diff --git a/gumbo-parser/src/string_set.h b/gumbo-parser/src/string_set.h
new file mode 100644
index 00000000000..5e08edd82cd
--- /dev/null
+++ b/gumbo-parser/src/string_set.h
@@ -0,0 +1,22 @@
+#ifndef STRING_SET_H
+#define STRING_SET_H
+
+#include <stddef.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // __cplusplus
+
+typedef struct hashmap GumboStringSet;
+
+GumboStringSet* gumbo_string_set_new(size_t cap);
+void gumbo_string_set_free(GumboStringSet *set);
+void gumbo_string_set_insert(GumboStringSet *set, const char *str);
+int gumbo_string_set_contains(GumboStringSet *set, const char *str);
+int gumbo_string_set_contains2(GumboStringSet *set, const char *str, size_t len);
+
+#if defined(__cplusplus)
+}
+#endif // __cplusplus
+
+#endif // STRING_SET_H
diff --git a/gumbo-parser/src/tokenizer.c b/gumbo-parser/src/tokenizer.c
index bc341813516..e82dd873a4c 100644
--- a/gumbo-parser/src/tokenizer.c
+++ b/gumbo-parser/src/tokenizer.c
@@ -58,6 +58,10 @@
 #include "utf8.h"
 #include "util.h"
 #include "vector.h"
+#include "string_set.h"
+
+// Tuned this based on benchmark in https://github.com/sparklemotion/nokogiri/issues/2568
+#define GUMBO_ATTRIBUTES_LOOKUP_MIN_SIZE 16
 
 // Compared against _temporary_buffer to determine if we're in
 // double-escaped script mode.
@@ -99,6 +103,7 @@ typedef struct GumboInternalTagState {
   // attributes are added as soon as their attribute name state is complete, and
   // values are filled in by operating on _attributes.data[attributes.length-1].
   GumboVector /* GumboAttribute */ _attributes;
+  GumboStringSet* _attributes_lookup;
 
   // If true, the next attribute value to be finished should be dropped. This
   // happens if a duplicate attribute name is encountered - we want to consume
@@ -440,11 +445,9 @@ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
   return EMIT_TOKEN;
 }
 
-// Debug-only function that explicitly sets the attribute vector data to NULL so
-// it can be asserted on tag creation, verifying that there are no memory leaks.
 static void mark_tag_state_as_empty(GumboTagState* tag_state) {
-  UNUSED_IF_NDEBUG(tag_state);
   tag_state->_name = NULL;
+  tag_state->_attributes_lookup = NULL;
 #ifndef NDEBUG
   tag_state->_attributes = kGumboEmptyVector;
 #endif
@@ -461,6 +464,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
     output->v.start_tag.attributes = tag_state->_attributes;
     output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
     tag_state->_last_start_tag = tag_state->_tag;
+    gumbo_string_set_free(tag_state->_attributes_lookup);
     mark_tag_state_as_empty(tag_state);
     gumbo_debug(
       "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
@@ -480,6 +484,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
       gumbo_destroy_attribute(tag_state->_attributes.data[i]);
     }
     gumbo_free(tag_state->_attributes.data);
+    gumbo_string_set_free(tag_state->_attributes_lookup);
     mark_tag_state_as_empty(tag_state);
     gumbo_debug(
       "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
@@ -508,6 +513,7 @@ static void abandon_current_tag(GumboParser* parser) {
   }
   gumbo_free(tag_state->_name);
   gumbo_free(tag_state->_attributes.data);
+  gumbo_string_set_free(tag_state->_attributes_lookup);
   mark_tag_state_as_empty(tag_state);
   gumbo_string_buffer_destroy(&tag_state->_buffer);
   gumbo_debug("Abandoning current tag.\n");
@@ -786,6 +792,8 @@ static void finish_attribute_name(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
   GumboTagState* tag_state = &tokenizer->_tag_state;
   GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
+  GumboStringSet* attributes_lookup = tag_state->_attributes_lookup;
+  char* attr_name = NULL;
 
   int max_attributes = parser->_options->max_attributes;
   if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
@@ -796,32 +804,42 @@ static void finish_attribute_name(GumboParser* parser) {
     return;
   }
 
+  if (attributes->length >= GUMBO_ATTRIBUTES_LOOKUP_MIN_SIZE && tag_state->_attributes_lookup == NULL) {
+    // Build the lookup set lazily; if it cannot be allocated, keep using the linear scan below.
+    attributes_lookup = tag_state->_attributes_lookup = gumbo_string_set_new(GUMBO_ATTRIBUTES_LOOKUP_MIN_SIZE * 2);
+    for (unsigned int i = 0; attributes_lookup != NULL && i < attributes->length; ++i) {
+      GumboAttribute* attr = attributes->data[i];
+      gumbo_string_set_insert(attributes_lookup, attr->name);
+    }
+  }
+
   // May've been set by a previous attribute without a value; reset it here.
   tag_state->_drop_next_attr_value = false;
   assert(tag_state->_attributes.data);
   assert(tag_state->_attributes.capacity);
 
-  for (unsigned int i = 0; i < attributes->length; ++i) {
-    GumboAttribute* attr = attributes->data[i];
-    if (
-      strlen(attr->name) == tag_state->_buffer.length
-      && 0 == memcmp (
-        attr->name,
-        tag_state->_buffer.data,
-        tag_state->_buffer.length
-      )
-    ) {
-      // Identical attribute; bail.
-      add_duplicate_attr_error(parser);
-      reinitialize_tag_buffer(parser);
-      tag_state->_drop_next_attr_value = true;
-      return;
+  if (!attributes_lookup) {
+    for (unsigned int i = 0; i < attributes->length; ++i) {
+      GumboAttribute* attr = attributes->data[i];
+      if (strlen(attr->name) == tag_state->_buffer.length
+          && 0 == memcmp(attr->name, tag_state->_buffer.data, tag_state->_buffer.length)) {
+        goto duplicate_attribute;
+      }
+    }
+  } else {
+    attr_name = gumbo_string_buffer_to_string(&tag_state->_buffer);  // freed below if it is a duplicate
+    if (gumbo_string_set_contains(attributes_lookup, attr_name)) {
+      goto duplicate_attribute;
     }
   }
 
   GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
   attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
-  copy_over_tag_buffer(parser, &attr->name);
+  if (attr_name) {
+    attr->name = attr_name;  // reuse the copy made for the lookup; attr now owns it
+  } else {
+    copy_over_tag_buffer(parser, &attr->name);
+  }
   copy_over_original_tag_text (
     parser,
     &attr->original_name,
@@ -836,7 +854,19 @@ static void finish_attribute_name(GumboParser* parser) {
     &attr->name_end
   );
   gumbo_vector_add(attr, attributes);
+  if (attributes_lookup) {
+    gumbo_string_set_insert(attributes_lookup, attr->name);
+  }
+  reinitialize_tag_buffer(parser);
+  return;
+
+duplicate_attribute:
+  // Identical attribute; bail.
+  gumbo_free(attr_name);
+  add_duplicate_attr_error(parser);
   reinitialize_tag_buffer(parser);
+  tag_state->_drop_next_attr_value = true;
+  return;
 }
 
 // Finishes an attribute value. 
This sets the value of the most recently added diff --git a/gumbo-parser/test/tokenizer.cc b/gumbo-parser/test/tokenizer.cc index 9cde694cf07..139dda10f27 100644 --- a/gumbo-parser/test/tokenizer.cc +++ b/gumbo-parser/test/tokenizer.cc @@ -4684,6 +4684,42 @@ TEST_F(GumboTokenizerTest, Data_MultipleAttributes) { NextChar('z'); } +TEST_F(GumboTokenizerTest, Data_DuplicateAttributes) { + SetInput(""); + NextStartTag(GUMBO_TAG_SPAN, true); + + Error(GUMBO_ERR_DUPLICATE_ATTRIBUTE); + ASSERT_EQ(3, token_.v.start_tag.attributes.length); + + GumboAttribute *attr = static_cast(token_.v.start_tag.attributes.data[0]); + EXPECT_STREQ("foo", attr->name); + EXPECT_STREQ("123", attr->value); + + attr = static_cast(token_.v.start_tag.attributes.data[1]); + EXPECT_STREQ("bar", attr->name); + EXPECT_STREQ("456", attr->value); + + attr = static_cast(token_.v.start_tag.attributes.data[2]); + EXPECT_STREQ("baz", attr->name); + EXPECT_STREQ("abc", attr->value); +} + +TEST_F(GumboTokenizerTest, Data_DuplicateAttributesWithHashtable) { + SetInput(""); + NextStartTag(GUMBO_TAG_SPAN, true); + + Error(GUMBO_ERR_DUPLICATE_ATTRIBUTE); + ASSERT_EQ(26, token_.v.start_tag.attributes.length); + + GumboAttribute *attr = static_cast(token_.v.start_tag.attributes.data[0]); + EXPECT_STREQ("a", attr->name); + EXPECT_STREQ("1", attr->value); + + attr = static_cast(token_.v.start_tag.attributes.data[25]); + EXPECT_STREQ("z", attr->name); + EXPECT_STREQ("1", attr->value); +} + TEST_F(GumboTokenizerTest, Data_LT_Alpha_Slash_GT) { SetInput("
z"); NextStartTag(GUMBO_TAG_BR); diff --git a/nokogiri.gemspec b/nokogiri.gemspec index e5c8b0effa4..83f631ff796 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -198,15 +198,19 @@ Gem::Specification.new do |spec| "gumbo-parser/src/error.h", "gumbo-parser/src/foreign_attrs.c", "gumbo-parser/src/foreign_attrs.gperf", - "gumbo-parser/src/nokogiri_gumbo.h", + "gumbo-parser/src/hashmap.c", + "gumbo-parser/src/hashmap.h", "gumbo-parser/src/insertion_mode.h", "gumbo-parser/src/macros.h", + "gumbo-parser/src/nokogiri_gumbo.h", "gumbo-parser/src/parser.c", "gumbo-parser/src/parser.h", "gumbo-parser/src/replacement.h", "gumbo-parser/src/string_buffer.c", "gumbo-parser/src/string_buffer.h", "gumbo-parser/src/string_piece.c", + "gumbo-parser/src/string_set.c", + "gumbo-parser/src/string_set.h", "gumbo-parser/src/svg_attrs.c", "gumbo-parser/src/svg_attrs.gperf", "gumbo-parser/src/svg_tags.c", diff --git a/test/html5/test_attributes.rb b/test/html5/test_attributes.rb index 0dbfb6f13ea..9cef22c7edc 100644 --- a/test/html5/test_attributes.rb +++ b/test/html5/test_attributes.rb @@ -15,4 +15,16 @@ def test_serialize_attribute assert_equal('id="foo"', id_attr.to_html) assert_equal('class="bar baz"', class_attr.to_html) end + + def test_duplicate_attributes + html = +"" + span = Nokogiri::HTML5::DocumentFragment.parse(html, max_attributes: 1000).at_css("span") + + assert_equal(676, span.attributes.length, "duplicate attribute should be silently ignored") + assert_equal("1", span["bb"], "bb attribute should hold the value of the first occurrence") + end end if Nokogiri.uses_gumbo?