Context Navigation

source: mainline/kernel/generic/src/adt/cht.c@ 3bb732b

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since 3bb732b was 3bb732b, checked in by Adam Hraska <adam.hraska+hos@…>, 13 years ago
cht: Implemented insert, resize. Heavy work in progress. Excluded from build.
Property mode set to `100644`
File size: 47.1 KB

Rev	Line
[7ef2249]	1	/*
	2	* Copyright (c) 2012 Adam Hraska
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	*
	9	* - Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* - Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* - The name of the author may not be used to endorse or promote products
	15	* derived from this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	27	*/
	28
	29	/** @addtogroup genericadt
	30	* @{
	31	*/
	32
	33	/**
	34	* @file
	35	* @brief Concurrent resizable lock-free hash table.
	36	*
	37	*/
	38
	39	#include <adt/cht.h>
	40	#include <debug.h>
	41	#include <memstr.h>
	42	#include <mm/slab.h>
	43	#include <barrier.h>
	44	#include <compiler/barrier.h>
	45	#include <atomic.h>
	46	#include <synch/rcu.h>
	47
/* Binary logarithm of the minimum bucket count. */
#define CHT_MIN_ORDER 6
/* Binary logarithm of the maximum bucket count. */
#define CHT_MAX_ORDER (8 * sizeof(size_t))
/* Smallest number of buckets a table may have. */
#define CHT_MIN_BUCKET_CNT (1 << CHT_MIN_ORDER)
/* Must be a power of 2. */
#define CHT_MAX_LOAD 2
	56
	57	typedef cht_ptr_t marked_ptr_t;
	58	typedef bool (equal_pred_t)(void arg, const cht_link_t *item);
	59
	60	typedef enum mark {
	61	N_NORMAL = 0,
	62	N_DELETED = 1,
	63	N_INVALID = 1,
	64	N_CONST = 3,
	65	N_JOIN = 2,
	66	N_JOIN_FOLLOWS = 2,
	67	N_MARK_MASK = 3
	68	} mark_t;
	69
	70	typedef enum walk_mode {
	71	WM_NORMAL = 4,
	72	WM_LEAVE_JOIN,
	73	WM_MOVE_JOIN_FOLLOWS
	74	} walk_mode_t;
	75
	76	typedef struct wnd {
	77	marked_ptr_t *ppred;
	78	cht_link_t *cur;
	79	cht_link_t *last;
	80	} wnd_t;
	81
	82
	83	static size_t size_to_order(size_t bucket_cnt);
	84	static cht_buckets_t *alloc_buckets(size_t order);
	85
	86	static marked_ptr_t make_link(cht_link_t *next, mark_t mark);
	87	static cht_link_t * get_next(marked_ptr_t link);
	88	static mark_t get_mark(marked_ptr_t link);
	89
	90	static size_t key_hash(cht_t h, void key);
	91	static size_t node_hash(cht_t h, const cht_link_t item);
	92
[3bb732b]	93	static size_t calc_split_hash(size_t split_idx, size_t order);
[7ef2249]	94	static size_t calc_bucket_idx(size_t hash, size_t order);
	95	static size_t grow_idx(size_t idx);
	96	static size_t shrink_idx(size_t idx);
	97
	98
	99
	100	bool cht_create(cht_t h, size_t init_size, cht_ops_t op)
	101	{
	102	ASSERT(h);
	103	ASSERT(op && op->hash && op->key_hash && op->equal && op->key_equal);
	104
[3bb732b]	105	/* All operations are compulsory. */
[7ef2249]	106	if (!op \|\| !op->hash \|\| !op->key_hash \|\| !op->equal \|\| !op->key_equal)
	107	return false;
	108
	109	size_t order = size_to_order(init_size);
	110
	111	h->b = alloc_buckets(order);
	112
	113	if (!h->b)
	114	return false;
	115
	116	h->new_b = 0;
	117	h->op = op;
	118	atomic_set(&h->item_cnt, 0);
	119	atomic_set(&h->resize_reqs, 0);
	120	/* Ensure the initialization takes place before we start using the table. */
	121	write_barrier();
	122
	123	return true;
	124	}
	125
[3bb732b]	126	static cht_buckets_t *alloc_buckets(size_t order, bool set_invalid)
	127	{
	128	size_t bucket_cnt = (1 << order);
	129	cht_buckets_t *b = malloc(
	130	sizeof(cht_buckets_t) + (bucket_cnt - 1) * sizeof(marked_ptr_t));
	131
	132	if (!b)
	133	return 0;
	134
	135	b->order = order;
	136
	137	marked_ptr_t head_link
	138	= set_invalid ? make_link(0, N_INVALID) : make_link(0, N_NORMAL);
	139
	140	for (size_t i = 0; i < bucket_cnt; ++i) {
	141	b->head[i] = head_link;
	142	}
	143
	144	return b;
	145	}
	146
	147	static size_t size_to_order(size_t bucket_cnt)
	148	{
	149	size_t order = CHT_MIN_ORDER;
	150
	151	/* Find a power of two such that bucket_cnt <= 2^order */
	152	do {
	153	if (bucket_cnt <= (1 << order))
	154	return order;
	155
	156	++order;
	157	} while (order < CHT_MAX_ORDER);
	158
	159	return order;
	160	}
	161
	162
[7ef2249]	163	void cht_destroy(cht_t *h)
	164	{
	165	/* todo: impl */
	166	}
	167
	168	cht_link_t cht_find(cht_t h, void *key)
	169	{
[3bb732b]	170	/* Make the most recent changes of the table visible. */
[7ef2249]	171	read_barrier();
	172	return cht_find_lazy(h, key);
	173	}
	174
	175
	176	cht_link_t cht_find_lazy(cht_t h, void *key)
	177	{
	178	ASSERT(h);
	179	ASSERT(rcu_read_locked());
	180
	181	size_t hash = key_hash(h, key);
	182
	183	cht_buckets_t *b = rcu_access(h->b);
	184	size_t idx = calc_bucket_idx(hash, b->order);
[3bb732b]	185	/*
	186	* No need for access_once. b->head[idx] will point to an allocated node
	187	* even if marked invalid until we exit rcu read section.
	188	*/
[7ef2249]	189	marked_ptr_t head = b->head[idx];
	190
	191	if (N_INVALID == get_mark(head))
	192	return find_resizing(h, key, hash, head, idx);
	193
	194	return search_bucket(h, head, key, hash);
	195	}
	196
	197
	198	static cht_link_t search_bucket(cht_t h, marked_ptr_t head, void *key,
	199	size_t search_hash)
	200	{
	201	cht_link_t *cur = get_next(head);
	202
	203	while (cur) {
	204	/*
	205	* It is safe to access nodes even outside of this bucket (eg when
	206	* splitting the bucket). The resizer makes sure that any node we
	207	* may find by following the next pointers is allocated.
	208	*/
	209	size_t cur_hash = node_hash(cur);
	210
	211	if (cur_hash >= search_hash) {
	212	if (cur_hash != search_hash)
	213	return 0;
	214
	215	int present = !(N_DELETED & get_mark(cur->link));
	216	if (present && h->op->key_equal(key, cur))
	217	return cur;
	218	}
	219
	220	cur = get_next(cur->link);
	221	}
	222
	223	return 0;
	224	}
	225
	226	static cht_link_t find_resizing(cht_t h, void *key, size_t hash,
	227	marked_ptr_t old_head, size_t old_idx)
	228	{
	229	ASSERT(N_INVALID == get_mark(old_head));
	230	ASSERT(h->new_b);
	231
	232	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
	233	marked_ptr_t new_head = h->new_b->head[new_idx];
	234	marked_ptr_t search_head = new_head;
	235
	236	/* Growing. */
	237	if (h->b->order < h->new_b->order) {
	238	/*
	239	* Old bucket head is invalid, so it must have been already
	240	* moved. Make the new head visible if still not visible, ie
	241	* invalid.
	242	*/
	243	if (N_INVALID == get_mark(new_head)) {
	244	/*
	245	* We should be searching a newly added bucket but the old
	246	* moved bucket has not yet been split (its marked invalid)
	247	* or we have not yet seen the split.
	248	*/
	249	if (grow_idx(old_idx) != new_idx) {
	250	/*
	251	* Search the moved bucket. It is guaranteed to contain
	252	* items of the newly added bucket that were present
	253	* before the moved bucket was split.
	254	*/
	255	new_head = h->new_b->head[grow_idx(old_idx)];
	256	}
	257
	258	/* new_head is now the moved bucket, either valid or invalid. */
	259
	260	/*
	261	* The old bucket was definitely moved to new_head but the
	262	* change of new_head had not yet propagated to this cpu.
	263	*/
	264	if (N_INVALID == get_mark(new_head)) {
	265	/*
	266	* We could issue a read_barrier() and make the now valid
	267	* moved bucket head new_head visible, but instead fall back
	268	* on using the old bucket. Although the old bucket head is
	269	* invalid, it points to a node that is allocated and in the
	270	* right bucket. Before the node can be freed, it must be
	271	* unlinked from the head (or another item after that item
	272	* modified the new_head) and a grace period must elapse.
	273	* As a result had the node been already freed the grace
	274	* period preceeding the free() would make the unlink and
	275	* any changes to new_head visible. Therefore, it is safe
	276	* to use the node pointed to from the old bucket head.
	277	*/
	278
	279	search_head = old_head;
	280	} else {
	281	search_head = new_head;
	282	}
	283	}
	284
	285	return search_bucket(h, search_head, key, hash);
	286	} else if (h->b->order > h->new_b->order) {
	287	/* Shrinking. */
	288
	289	/* Index of the bucket in the old table that was moved. */
	290	size_t move_src_idx = grow_idx(new_idx);
	291	marked_ptr_t moved_old_head = h->b->head[move_src_idx];
	292
	293	/*
	294	* h->b->head[move_src_idx] had already been moved to new_head
	295	* but the change to new_head had not yet propagated to us.
	296	*/
	297	if (N_INVALID == get_mark(new_head)) {
	298	/*
	299	* new_head is definitely valid and we could make it visible
	300	* to this cpu with a read_barrier(). Instead, use the bucket
	301	* in the old table that was moved even though it is now marked
	302	* as invalid. The node it points to must be allocated because
	303	* a grace period would have to elapse before it could be freed;
	304	* and the grace period would make the now valid new_head
	305	* visible to all cpus.
	306	*
	307	* Note that move_src_idx may not be the same as old_idx.
	308	* If move_src_idx != old_idx then old_idx is the bucket
	309	* in the old table that is not moved but instead it is
	310	* appended to the moved bucket, ie it is added at the tail
	311	* of new_head. In that case an invalid old_head notes that
	312	* it had already been merged into (the moved) new_head.
	313	* We will try to search that bucket first because it
	314	* may contain some newly added nodes after the bucket
	315	* join. Moreover, the bucket joining link may already be
	316	* visible even if new_head is not. Therefore, if we're
	317	* lucky we'll find the item via moved_old_head. In any
	318	* case, we'll retry in proper old_head if not found.
	319	*/
	320	search_head = moved_old_head;
	321	}
	322
	323	cht_link_t *ret = search_bucket(h, search_head, key, hash);
	324
	325	if (ret)
	326	return ret;
	327	/*
	328	* Bucket old_head was already joined with moved_old_head
	329	* in the new table but we have not yet seen change of the
	330	* joining link (or the item is not in the table).
	331	*/
	332	if (move_src_idx != old_idx && get_next(old_head)) {
	333	/*
	334	* Note that old_head (the bucket to be merged into new_head)
	335	* points to an allocated join node (if non-null) even if marked
	336	* invalid. Before the resizer lets join nodes to be unlinked
	337	* (and freed) it sets old_head to 0 and waits for a grace period.
	338	* So either the invalid old_head points to join node; or old_head
	339	* is null and we would have seen a completed bucket join while
	340	* traversing search_head.
	341	*/
	342	ASSERT(N_JOIN & get_mark(get_next(old_head)->link));
	343	return search_bucket(h, old_head, key, hash);
	344	}
	345
	346	return 0;
	347	} else {
	348	/*
	349	* Resize is almost done. The resizer is waiting to make
	350	* sure all cpus see that the new table replaced the old one.
	351	*/
	352	ASSERT(h->b->order == h->new_b->order);
	353	/*
	354	* The resizer must ensure all new bucket heads are visible before
	355	* replacing the old table.
	356	*/
	357	ASSERT(N_NORMAL == get_mark(new_head));
	358	return search_bucket(h, new_head, key, hash);
	359	}
	360	}
	361
	362
	363	void cht_insert(cht_t h, cht_link_t item)
	364	{
	365	return insert_impl(h, item, true);
	366	}
	367
	368	bool cht_insert_unique(cht_t h, cht_link_t item)
	369	{
	370	insert_impl(h, item, false);
	371	}
	372
	373	bool insert_impl(cht_t h, cht_link_t item, bool unique)
	374	{
	375	rcu_read_lock();
	376
[3bb732b]	377	cht_buckets_t *b = rcu_access(h->b);
	378	size_t hash = node_hash(h, item);
	379	size_t idx = calc_bucket_idx(hash, b->order);
	380	marked_ptr_t *phead = &b->head[idx];
	381
	382	bool resizing = false;
	383	bool inserted;
	384
	385	do {
	386	walk_mode_t walk_mode = WM_NORMAL;
	387	bool join_finishing;
	388
	389	resizing = resizing \|\| (N_NORMAL != get_mark(*phead));
	390
	391	/* The table is resizing. Get the correct bucket head. */
	392	if (resizing) {
	393	upd_resizing_head(hash, &phead, &join_finishing, &walk_mode);
	394	}
	395
	396	wnd_t wnd = {
	397	.ppred = phead,
	398	.cur = get_next(*phead),
	399	.last = 0
	400	};
	401
	402	if (!find_wnd_and_gc(h, hash, walk_mode, &wnd, &resizing)) {
	403	/* Could not GC a node; or detected an unexpected resize. */
	404	continue;
	405	}
	406
	407	if (unique && has_duplicates(h, item, hash, wnd)) {
	408	rcu_read_unlock();
	409	return false;
	410	}
	411
	412	inserted = insert_at(item, wnd, walk_mode, &resizing);
	413	} while (!inserted);
	414
	415	item_inserted(h);
[7ef2249]	416
	417	rcu_read_unlock();
[3bb732b]	418	return true;
	419	}
	420
	421	static bool insert_at(cht_link_t item, const wnd_t wnd, walk_mode_t walk_mode,
	422	bool *resizing)
	423	{
	424	marked_ptr_t ret;
	425
	426	if (walk_mode == WM_NORMAL) {
	427	item->link = make_link(wnd->cur, N_NORMAL);
	428	/* Initialize the item before adding it to a bucket. */
	429	memory_barrier();
	430
	431	/* Link a clean/normal predecessor to the item. */
	432	ret = cas_link(wnd->ppred, wnd->cur, N_NORMAL, item, N_NORMAL);
	433
	434	if (ret == make_link(wnd->cur, N_NORMAL)) {
	435	return true;
	436	} else {
	437	*resizing = ((N_JOIN_FOLLOWS \| N_JOIN) & get_mark(ret));
	438	return false;
	439	}
	440	} else if (walk_mode == WM_MOVE_JOIN_FOLLOWS) {
	441	/* Move JOIN_FOLLOWS mark but filter out the DELETED mark. */
	442	mark_t jf_mark = get_mark(*wnd->ppred) & N_JOIN_FOLLOWS;
	443	item->link = make_link(wnd->cur, jf_mark);
	444	/* Initialize the item before adding it to a bucket. */
	445	memory_barrier();
	446
	447	/* Link the not-deleted predecessor to the item. Move its JF mark. */
	448	ret = cas_link(wnd->ppred, wnd->cur, jf_mark, item, N_NORMAL);
	449
	450	return ret == make_link(wnd->cur, jf_mark);
	451	} else {
	452	ASSERT(walk_mode == WM_LEAVE_JOIN);
	453
	454	item->link = make_link(wnd->cur, N_NORMAL);
	455	/* Initialize the item before adding it to a bucket. */
	456	memory_barrier();
	457
	458	mark_t pred_mark = get_mark(*wnd->ppred);
	459	/* If the predecessor is a join node it may be marked deleted.*/
	460	mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
	461
	462	ret = cas_link(wnd->ppred, wnd->cur, exp_pred_mark, item, exp_pred_mark);
	463	return ret == make_link(wnd->cur, exp_pred_mark);
	464	}
	465	}
	466
	467	static bool has_duplicates(cht_t h, cht_link_t item, size_t hash,
	468	const wnd_t *cwnd)
	469	{
	470	ASSERT(0 == wnd->cur \|\| hash <= node_hash(h, wnd->cur));
	471
	472	if (0 == wnd->cur \|\| hash < node_hash(h, wnd->cur))
	473	return false;
	474
	475	/*
	476	* Load the most recent node marks. Otherwise we might pronounce a
	477	* logically deleted node for a duplicate of the item just because
	478	* the deleted node's DEL mark had not yet propagated to this cpu.
	479	*/
	480	read_barrier();
	481
	482	cht_link_t *cur = wnd->cur;
	483
	484	do {
	485	bool deleted = (N_DELETED & get_mark(cur->link));
	486
	487	/* Skip logically deleted nodes. */
	488	if (!deleted && h->op->equal(item, cur))
	489	return true;
	490
	491	cur = get_next(cur->link);
	492	} while (cur && node_hash(h, cur) == hash);
	493
	494	return false;
[7ef2249]	495	}
	496
	497
	498	size_t cht_remove_key(cht_t h, void key)
	499	{
	500	ASSERT(h);
	501
	502	size_t hash = key_hash(h, key);
	503	size_t removed = 0;
	504
	505	while (remove_pred(h, hash, h->op->key_equal, key))
	506	++removed;
	507
	508	return removed;
	509	}
	510
	511	bool cht_remove_item(cht_t h, cht_link_t item)
	512	{
	513	ASSERT(h);
	514	ASSERT(item);
	515
	516	/*
	517	* Even though we know the node we want to delete we must unlink it
	518	* from the correct bucket and from a clean/normal predecessor. Therefore,
	519	* we search for it again from the beginning of the correct bucket.
	520	*/
	521	size_t hash = node_hash(h, item);
	522	return remove_pred(h, hash, same_node_pred, item);
	523	}
	524
	525
	526	static bool remove_pred(cht_t h, size_t hash, equal_pred_t pred, void pred_arg)
	527	{
	528	rcu_read_lock();
	529
	530	bool resizing = false;
	531	bool deleted = false;
	532	bool deleted_but_gc = false;
	533
	534	cht_buckets_t *b = rcu_access(h->b);
	535	size_t idx = calc_bucket_idx(hash, b->order);
	536	marked_ptr_t *phead = &b->head[idx];
	537
	538	do {
	539	walk_mode_t walk_mode = WM_NORMAL;
	540	bool join_finishing = false;
	541
	542	resizing = resizing \|\| (N_NORMAL != get_mark(*phead));
	543
	544	/* The table is resizing. Get the correct bucket head. */
	545	if (resizing) {
	546	upd_resizing_head(hash, &phead, &join_finishing, &walk_mode);
	547	}
	548
	549	wnd_t wnd = {
	550	.ppred = phead,
	551	.cur = get_next(*phead),
	552	.last = 0
	553	};
	554
	555	if (!find_wnd_and_gc_pred(
	556	h, hash, walk_mode, pred, pred_arg, &wnd, &resizing)) {
	557	/* Could not GC a node; or detected an unexpected resize. */
	558	continue;
	559	}
	560
	561	/*
	562	* The item lookup is affected by a bucket join but effects of
	563	* the bucket join have not been seen while searching for the item.
	564	*/
	565	if (join_finishing && !join_completed(h, &wnd)) {
	566	/*
	567	* Bucket was appended at the end of another but the next
	568	* ptr linking them together was not visible on this cpu.
	569	* join_completed() makes this appended bucket visible.
	570	*/
	571	continue;
	572	}
	573
	574	/* Already deleted, but delete_at() requested one GC pass. */
	575	if (deleted_but_gc)
	576	break;
	577
	578	bool found = wnd.cur && pred(pred_arg, wnd.cur);
	579
	580	if (!found) {
	581	rcu_read_unlock();
	582	return false;
	583	}
	584
	585	deleted = delete_at(wnd, walk_mode, &deleted_but_gc, &resizing);
	586	} while (!deleted \|\| deleted_but_gc);
	587
	588	rcu_read_unlock();
	589	return true;
	590	}
	591
	592
	593	static bool delete_at(cht_t h, wnd_t wnd, walk_mode_t walk_mode,
	594	bool deleted_but_gc, bool resizing)
	595	{
	596	ASSERT(wnd->cur);
	597
	598	*deleted_but_gc = false;
	599
	600	if (!mark_deleted(wnd->cur, walk_mode, resizing)) {
	601	/* Already deleted, or unexpectedly marked as JOIN/JOIN_FOLLOWS. */
	602	return false;
	603	}
	604
	605	/* Marked deleted. Unlink from the bucket. */
	606
	607	/* Never unlink join nodes. */
	608	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link)))
	609	return true;
	610
	611	if (unlink_from_pred(wnd, walk_mode, resizing)) {
	612	free_later(h, wnd->cur);
	613	} else {
	614	*deleted_but_gc = true;
	615	}
	616
	617	return true;
	618	}
	619
	620	static bool mark_deleted(cht_link_t cur, walk_mode_t walk_mode, bool resizing)
	621	{
	622	ASSERT(cur);
	623
	624	/*
	625	* Btw, we could loop here if the cas fails but let's not complicate
	626	* things and let's retry from the head of the bucket.
	627	*/
	628
	629	cht_link_t *next = get_next(cur->link);
	630
	631	if (walk_mode == WM_NORMAL) {
	632	/* Only mark clean/normal nodes - JF/JN is used only during resize. */
	633	marked_ptr_t normal_link = make_link(next, N_NORMAL);
	634	marked_ptr_t del_link = make_link(next, N_DELETED);
	635
	636	marked_ptr_t ret = cas_link(&cur->link, normal_link, del_link);
	637
	638	if (normal_link != ret) {
	639	*resizing = (N_JOIN \| N_JOIN_FOLLOWS \| N_INVALID) & get_mark(ret);
	640	return false;
	641	}
	642	} else {
	643	ASSERT(N_JOIN == N_JOIN_FOLLOWS);
	644
	645	/* Keep the N_JOIN/N_JOIN_FOLLOWS mark but strip N_DELETED. */
	646	mark_t cur_mark = get_mark(cur->link) & N_JOIN_FOLLOWS;
	647
	648	marked_ptr_t nondel_link = make_link(next, cur_mark);
	649	marked_ptr_t del_link = make_link(next, cur_mark \| N_DELETED);
	650
	651	if (nondel_link != cas_link(&cur->link, nondel_link, del_link))
	652	return false;
	653	}
	654
	655	return true;
	656	}
	657
	658	static bool unlink_from_pred(wnd_t wnd, walk_mode_t walk_mode, bool resizing)
	659	{
	660	ASSERT(wnd->cur && (N_DELETED & get_mark(wnd->cur->link)));
	661
	662	cht_link_t *next = get_next(wnd->cur->link);
	663
	664	if (walk_mode == WM_LEAVE_JOIN) {
	665	/* Never try to unlink join nodes. */
	666	ASSERT(!(N_JOIN & get_mark(wnd->cur->link)));
	667
	668	mark_t pred_mark = get_mark(*wnd->ppred);
	669	/* Succeed only of the predecessor is clean/normal or a join node. */
	670	mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
	671
	672	marked_ptr_t pred_link = make_link(wnd->cur, exp_pred_mark);
	673	marked_ptr_t next_link = make_link(next, exp_pred_mark);
	674
	675	if (pred_link != cas_link(wnd->ppred, pred_link, next_link))
	676	return false;
	677	} else {
	678	ASSERT(walk_mode == WM_MOVE_JOIN_FOLLOWS \|\| walk_mode == WM_NORMAL);
	679	/* Move the JF mark if set. Clear DEL mark. */
	680	mark_t cur_mark = N_JOIN_FOLLOWS & get_mark(wnd->cur->link);
	681
	682	/* The predecessor must be clean/normal. */
	683	marked_ptr_t pred_link = make_link(wnd->cur, N_NORMAL);
	684	/* Link to cur's successor keeping/copying cur's JF mark. */
	685	marked_ptr_t next_link = make_link(next, cur_mark);
	686
	687	marked_ptr_t ret = cas_link(wnd->ppred, pred_link, next_link);
	688
	689	if (pred_link != ret) {
	690	/* If we're not resizing the table there are no JF/JN nodes. */
	691	*resizing = (walk_mode == WM_NORMAL)
	692	&& (N_JOIN_FOLLOWS & get_mark(ret));
	693	return false;
	694	}
	695	}
	696
	697	return true;
	698	}
	699
	700
	701	static bool find_wnd_and_gc_pred(cht_t *h, size_t hash, walk_mode_t walk_mode,
	702	equal_pred_t pred, void pred_arg, wnd_t wnd, bool *resizing)
	703	{
	704	if (!wnd->cur)
	705	return true;
	706
	707	/*
	708	* A read barrier is not needed here to bring up the most recent
	709	* node marks (esp the N_DELETED). At worst we'll try to delete
	710	* an already deleted node; fail in delete_at(); and retry.
	711	*/
	712
	713	size_t cur_hash = node_hash(h, wnd->cur);
	714
	715	while (cur_hash <= hash) {
	716	/* GC any deleted nodes on the way. */
	717	if (N_DELETED & get_mark(wnd->cur->link)) {
	718	if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
	719	/* Retry from the head of a bucket. */
	720	return false;
	721	}
	722	} else {
	723	/* Is this the node we were looking for? */
	724	if (cur_hash == hash && pred(pred_arg, wnd->cur))
	725	return true;
	726
	727	next_wnd(wnd);
	728	}
	729
	730	/* The searched for node is not in the current bucket. */
[3bb732b]	731	if (!wnd->cur)
[7ef2249]	732	return true;
	733
	734	cur_hash = node_hash(h, wnd->cur);
	735	}
	736
	737	/* The searched for node is not in the current bucket. */
	738	return true;
	739	}
	740
	741	/* todo: comment different semantics (eg deleted JN first w/ specific hash) */
	742	static bool find_wnd_and_gc(cht_t *h, size_t hash, walk_mode_t walk_mode,
	743	wnd_t wnd, bool resizing)
	744	{
	745	while (wnd->cur && node_hash(h, wnd->cur) < hash) {
	746	/* GC any deleted nodes along the way to our desired node. */
	747	if (N_DELETED & get_mark(wnd->cur->link)) {
	748	if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
	749	/* Failed to remove the garbage node. Retry. */
	750	return false;
	751	}
	752	} else {
	753	next_wnd(wnd);
	754	}
	755	}
	756
	757	/* wnd->cur may be 0 or even marked N_DELETED. */
	758	return true;
	759	}
	760
	761	static bool gc_deleted_node(cht_t h, walk_mode_t walk_mode, wnd_t wnd,
	762	bool *resizing)
	763	{
	764	ASSERT(N_DELETED & get_mark(wnd->cur->link));
	765
	766	/* Skip deleted JOIN nodes. */
	767	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link))) {
	768	next_wnd(wnd);
	769	} else {
	770	/* Ordinary deleted node or a deleted JOIN_FOLLOWS. */
	771	ASSERT(walk_mode != WM_LEAVE_JOIN
	772	\|\| !((N_JOIN \| N_JOIN_FOLLOWS) & get_mark(wnd->cur->link)));
	773
	774	/* Unlink an ordinary deleted node, move JOIN_FOLLOWS mark. */
	775	if (!unlink_from_pred(wnd, walk_mode, resizing)) {
	776	/* Retry. The predecessor was deleted, invalid, const, join_follows. */
	777	return false;
	778	}
	779
	780	free_later(h, wnd->cur);
	781
	782	/* Leave ppred as is. */
	783	wnd->last = wnd->cur;
	784	wnd->cur = get_next(wnd->cur->link);
	785	}
	786
	787	return true;
	788	}
	789
	790	static bool join_completed(cht_t h, const wnd_t wnd)
	791	{
	792	/*
	793	* The table is shrinking and the searched for item is in a bucket
	794	* appended to another. Check that the link joining these two buckets
	795	* is visible and if not, make it visible to this cpu.
	796	*/
	797
	798	/*
	799	* Resizer ensures h->b->order stays the same for the duration of this
	800	* func. We got here because there was an alternative head to search.
	801	* The resizer waits for all preexisting readers to finish after
	802	* it
	803	*/
	804	ASSERT(h->b->order > h->new_b->order);
	805
	806	/* Either we did not need the joining link or we have already followed it.*/
	807	if (wnd->cur)
	808	return true;
	809
	810	/* We have reached the end of a bucket. */
	811
	812	if (wnd->last) {
	813	size_t last_seen_hash = node_hash(h, wnd->last);
	814	size_t last_old_idx = calc_bucket_idx(last_seen_hash, h->b->order);
	815	size_t move_src_idx = grow_idx(shrink_idx(last_old_idx));
	816
	817	/*
	818	* Last was in the joining bucket - if the searched for node is there
	819	* we will find it.
	820	*/
	821	if (move_src_idx != last_old_idx)
	822	return true;
	823	}
	824
	825	/*
	826	* Reached the end of the bucket but no nodes from the joining bucket
	827	* were seen. There should have at least been a JOIN node so we have
	828	* definitely not seen (and followed) the joining link. Make the link
	829	* visible and retry.
	830	*/
	831	read_barrier();
	832	return false;
	833	}
	834
	835	static void upd_resizing_head(cht_t h, size_t hash, marked_ptr_t *phead,
	836	bool join_finishing, bool walk_mode)
	837	{
	838	cht_buckets_t *b = rcu_access(h->b);
	839	size_t old_idx = calc_bucket_idx(hash, b->order);
	840	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
	841
	842	marked_ptr_t *pold_head = &b->head[old_idx];
	843	marked_ptr_t *pnew_head = &h->new_b->head[new_idx];
	844
	845	/* In any case, use the bucket in the new table. */
	846	*phead = pnew_head;
	847
	848	/* Growing the table. */
	849	if (b->order < h->new_b->order) {
	850	size_t move_dest_idx = grow_idx(old_idx);
	851	marked_ptr_t *pmoved_head = &h->new_b->head[move_dest_idx];
	852
	853	/* Complete moving the bucket from the old to the new table. */
	854	move_head(pold_head, pmoved_head);
	855
	856	/* The hash belongs to the moved bucket. */
	857	if (move_dest_idx == new_idx) {
	858	/*
	859	* move_head() makes the new head of the moved bucket visible.
	860	* The new head may be marked with a JOIN_FOLLOWS
	861	*/
	862	ASSERT(!(N_CONST & get_mark(*pnew_head)));
	863	*walk_mode = WM_MOVE_JOIN_FOLLOWS;
	864	} else {
	865	/*
	866	* The hash belongs to the bucket that is the result of splitting
	867	* the old/moved bucket, ie the bucket that contains the second
	868	* half of the split/old/moved bucket.
	869	*/
	870
	871	/* The moved bucket has not yet been split. */
	872	if (N_NORMAL != get_mark(*pnew_head)) {
[3bb732b]	873	size_t split_hash = calc_split_hash(new_idx, h->new_b->order);
	874	split_bucket(pmoved_head, pnew_head, split_hash);
[7ef2249]	875	/*
	876	* split_bucket() makes the new head visible. No
	877	* JOIN_FOLLOWS in this part of split bucket.
	878	*/
	879	ASSERT(N_NORMAL == get_mark(*pnew_head));
	880	}
	881
	882	*walk_mode = WM_LEAVE_JOIN;
	883	}
	884	} else if (h->new_b->order < b->order ) {
	885	/* Shrinking the table. */
	886
	887	size_t move_src_idx = grow_idx(new_idx);
	888
	889	/*
	890	* Complete moving the bucket from the old to the new table.
	891	* Makes a valid pnew_head visible if already moved.
	892	*/
	893	move_head(&b->head[move_src_idx], pnew_head);
	894
	895	/* Hash belongs to the bucket to be joined with the moved bucket. */
	896	if (move_src_idx != old_idx) {
	897	/* Bucket join not yet completed. */
	898	if (N_INVALID != get_mark(*pold_head)) {
[3bb732b]	899	size_t split_hash = calc_split_hash(old_idx, b->order);
	900	join_buckets(pold_head, pnew_head, split_hash);
[7ef2249]	901	}
	902
	903	/* The resizer sets pold_head to 0 when all cpus see the bucket join.*/
	904	join_finishing = (0 != get_next(pold_head));
	905	}
	906
	907	/* move_head() or join_buckets() makes it so or makes the mark visible.*/
	908	ASSERT(N_INVALID == get_mark(*pold_head));
	909	/* move_head() makes it visible. No JOIN_FOLLOWS used when shrinking. */
	910	ASSERT(N_NORMAL == get_mark(*pnew_head));
	911
	912	*walk_mode = WM_LEAVE_JOIN;
	913	} else {
	914	/*
	915	* Final stage of resize. The resizer is waiting for all
	916	* readers to notice that the old table had been replaced.
	917	*/
	918	ASSERT(b == h->new_b);
	919	*walk_mode = WM_NORMAL;
	920	}
	921	}
	922
	923
#if 0
/* Currently compiled out: starts and completes a bucket head move. */
static void move_head(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head)
{
	start_head_move(psrc_head);
	complete_head_move(psrc_head, pdest_head);
}
#endif
[7ef2249]	931
	932	static void help_head_move(marked_ptr_t psrc_head, marked_ptr_t pdest_head)
	933	{
	934	/* Head move has to in progress already when calling this func. */
	935	ASSERT(N_CONST & get_mark(*psrc_head));
	936
	937	/* Head already moved. */
	938	if (N_INVALID == get_mark(*psrc_head)) {
	939	/* Effects of the head move have not yet propagated to this cpu. */
	940	if (N_INVALID == get_mark(*pdest_head)) {
	941	/* Make the move visible on this cpu. */
	942	read_barrier();
	943	ASSERT(!(N_CONST & get_mark(*pdest_head)));
	944	}
	945	} else {
	946	complete_head_move(psrc_head, pdest_head);
	947	}
	948	}
	949
	950	static void start_head_move(marked_ptr_t *psrc_head)
	951	{
	952	/* Mark src head immutable. */
	953	mark_const(psrc_head);
	954	}
	955
	956	static void mark_const(marked_ptr_t *psrc_head)
	957	{
	958	marked_ptr_t ret, src_link;
	959
	960	/* Mark src head immutable. */
	961	do {
	962	cht_link_t next = get_next(psrc_head);
	963	src_link = make_link(next, N_NORMAL);
	964
	965	/* Mark the normal/clean src link immutable/const. */
	966	ret = cas_link(psrc_head, next, N_NORMAL, next, N_CONST);
	967	} while(ret != src_link && !(N_CONST & get_mark(ret)));
	968	}
	969
	970	static void complete_head_move(marked_ptr_t psrc_head, marked_ptr_t pdest_head)
	971	{
	972	ASSERT(N_JOIN_FOLLOWS != get_mark(*psrc_head));
	973	ASSERT(N_CONST & get_mark(*psrc_head));
	974
	975	cht_link_t next = get_next(psrc_head);
	976	/* todo: cas order barrier */
	977	cas_link(pdest_head, 0, N_INVALID, next, N_NORMAL);
	978	/* todo: cas order barrier */
	979	cas_link(psrc_head, next, N_CONST, next, N_INVALID);
	980	}
	981
	982	static void split_bucket(cht_t h, marked_ptr_t psrc_head,
	983	marked_ptr_t *pdest_head, size_t split_hash)
	984	{
	985	/* Already split. */
	986	if (N_NORMAL == get_mark(*pdest_head))
	987	return;
	988
	989	/*
	990	* L == Last node of the first part of the split bucket. That part
	991	* remains in the original/src bucket.
	992	* F == First node of the second part of the split bucket. That part
	993	* will be referenced from the dest bucket head.
	994	*
	995	* We want to first mark a clean L as JF so that updaters unaware of
	996	* the split (or table resize):
	997	* - do not insert a new node between L and F
	998	* - do not unlink L (that is why it has to be clean/normal)
	999	* - do not unlink F
	1000	*
	1001	* Then we can safely mark F as JN even if it has been marked deleted.
	1002	* Once F is marked as JN updaters aware of table resize will not
	1003	* attempt to unlink it (JN will have two predecessors - we cannot
	1004	* safely unlink from both at the same time). Updaters unaware of
	1005	* ongoing resize can reach F only via L and that node is already
	1006	* marked JF, so they won't unlink F.
	1007	*
	1008	* Last, link the new/dest head to F.
	1009	*
	1010	*
	1011	* 0) ,-- split_hash, first hash of the dest bucket
	1012	* v
	1013	* [src_head \| N] -> .. -> [L] -> [F]
	1014	* [dest_head \| Inv]
	1015	*
	1016	* 1) ,-- split_hash
	1017	* v
	1018	* [src_head \| N] -> .. -> [JF] -> [F]
	1019	* [dest_head \| Inv]
	1020	*
	1021	* 2) ,-- split_hash
	1022	* v
	1023	* [src_head \| N] -> .. -> [JF] -> [JN]
	1024	* [dest_head \| Inv]
	1025	*
	1026	* 2) ,-- split_hash
	1027	* v
	1028	* [src_head \| N] -> .. -> [JF] -> [JN]
	1029	* ^
	1030	* [dest_head \| N] -----------------'
	1031	*/
	1032	wnd_t wnd;
	1033	bool done;
	1034
[3bb732b]	1035	rcu_read_lock();
	1036
[7ef2249]	1037	/* Mark the last node of the first part of the split bucket as JF. */
	1038	mark_join_follows(h, psrc_head, split_hash, &wnd);
	1039
	1040	/* todo: cas order barrier */
	1041
	1042	/* There are nodes in the dest bucket, ie the second part of the split. */
	1043	if (wnd.cur) {
	1044	/*
	1045	* Mark the first node of the dest bucket as a join node so
	1046	* updaters do not attempt to unlink it if it is deleted.
	1047	*/
	1048	mark_join_node(wnd.cur);
	1049	} else {
	1050	/*
	1051	* Second part of the split bucket is empty. There are no nodes
	1052	* to mark as JOIN nodes and there never will be.
	1053	*/
	1054	}
	1055
	1056	/* Link the dest head to the second part of the split. */
	1057	cas_link(pdest_head, 0, N_INVALID, wnd.cur, N_NORMAL);
[3bb732b]	1058
	1059	rcu_read_unlock();
[7ef2249]	1060	}
	1061
	1062	static void mark_join_follows(cht_t h, marked_ptr_t psrc_head,
	1063	size_t split_hash, wnd_t *wnd)
	1064	{
	1065	/* See comment in split_bucket(). */
	1066
	1067	bool done;
	1068	do {
	1069	bool dummy;
	1070	wnd->ppred = psrc_head;
	1071	wnd->cur = get_next(*psrc_head);
	1072
	1073	/*
	1074	* Find the split window, ie the last node of the first part of
	1075	* the split bucket and the its successor - the first node of
	1076	* the second part of the split bucket. Retry if GC failed.
	1077	*/
	1078	if (!find_wnd_and_gc(h, split_hash, WM_MOVE_JOIN_FOLLOWS, wnd, &dummy))
	1079	continue;
	1080
	1081	/*
	1082	* Mark the last node of the first half of the split bucket
	1083	* that a join node follows. It must be clean/normal.
	1084	*/
	1085	marked_ptr_t ret
	1086	= cas_link(wnd->ppred, wnd->cur, N_NORMAL, wnd->cur, N_JOIN_FOLLOWS);
	1087
	1088	/* Successfully marked as a JF node or already marked that way. */
	1089	done = (ret == make_link(wnd->cur, N_NORMAL))
	1090	\|\| (N_JOIN_FOLLOWS & get_mark(ret));
	1091	} while (!done);
	1092	}
	1093
	1094	static void mark_join_node(cht_link_t *join_node)
	1095	{
	1096	/* See comment in split_bucket(). */
	1097
	1098	bool done;
	1099	do {
	1100	cht_link_t next = get_next(join_node);
	1101	mark_t mark = get_mark(*join_node);
	1102
	1103	/*
	1104	* May already be marked as deleted, but it won't be unlinked
	1105	* because its predecessor is marked with JOIN_FOLLOWS or CONST.
	1106	*/
	1107	marked_ptr_t ret
	1108	= cas_link(&join_node->link, next, mark, next, mark \| N_JOIN);
	1109
	1110	/* Successfully marked or already marked as a join node. */
	1111	done = (ret == make_link(next, mark))
	1112	\|\| (N_JOIN & get_mark(ret));
	1113	} while(!done);
	1114	}
	1115
	1116
	1117	static void join_buckets(cht_t h, marked_ptr_t psrc_head,
[3bb732b]	1118	marked_ptr_t *pdest_head, size_t split_hash)
[7ef2249]	1119	{
	1120	/* Buckets already joined. */
	1121	if (N_INVALID == get_mark(*psrc_head))
	1122	return;
	1123	/*
	1124	* F == First node of psrc_head, ie the bucket we want to append
	1125	* to (ie join with) the bucket starting at pdest_head.
	1126	* L == Last node of pdest_head, ie the bucket that psrc_head will
	1127	* be appended to.
	1128	*
	1129	* (1) We first mark psrc_head immutable to signal that a join is
	1130	* in progress and so that updaters unaware of the join (or table
	1131	* resize):
	1132	* - do not insert new nodes between the head psrc_head and F
	1133	* - do not unlink F (it may already be marked deleted)
	1134	*
	1135	* (2) Next, F is marked as a join node. Updaters aware of table resize
	1136	* will not attempt to unlink it. We cannot safely/atomically unlink
	1137	* the join node because it will be pointed to from two different
	1138	* buckets. Updaters unaware of resize will fail to unlink the join
	1139	* node due to the head being marked immutable.
	1140	*
	1141	* (3) Then the tail of the bucket at pdest_head is linked to the join
	1142	* node. From now on, nodes in both buckets can be found via pdest_head.
	1143	*
	1144	* (4) Last, mark immutable psrc_head as invalid. It signals updaters
	1145	* that the join is complete and they can insert new nodes (originally
	1146	* destined for psrc_head) into pdest_head.
	1147	*
	1148	* Note that pdest_head keeps pointing at the join node. This allows
	1149	* lookups and updaters to determine if they should see a link between
	1150	* the tail L and F when searching for nodes originally in psrc_head
	1151	* via pdest_head. If they reach the tail of pdest_head without
	1152	* encountering any nodes of psrc_head, either there were no nodes
	1153	* in psrc_head to begin with or the link between L and F did not
	1154	* yet propagate to their cpus. If psrc_head was empty, it remains
	1155	* NULL. Otherwise psrc_head points to a join node (it will not be
	1156	* unlinked until table resize completes) and updaters/lookups
	1157	* should issue a read_barrier() to make the link [L]->[JN] visible.
	1158	*
	1159	* 0) ,-- split_hash, first hash of the src bucket
	1160	* v
	1161	* [dest_head \| N]-> .. -> [L]
	1162	* [src_head \| N]--> [F] -> ..
	1163	* ^
	1164	* ` split_hash, first hash of the src bucket
	1165	*
	1166	* 1) ,-- split_hash
	1167	* v
	1168	* [dest_head \| N]-> .. -> [L]
	1169	* [src_head \| C]--> [F] -> ..
	1170	*
	1171	* 2) ,-- split_hash
	1172	* v
	1173	* [dest_head \| N]-> .. -> [L]
	1174	* [src_head \| C]--> [JN] -> ..
	1175	*
	1176	* 3) ,-- split_hash
	1177	* v
	1178	* [dest_head \| N]-> .. -> [L] --+
	1179	* v
	1180	* [src_head \| C]-------------> [JN] -> ..
	1181	*
	1182	* 4) ,-- split_hash
	1183	* v
	1184	* [dest_head \| N]-> .. -> [L] --+
	1185	* v
	1186	* [src_head \| Inv]-----------> [JN] -> ..
	1187	*/
	1188
[3bb732b]	1189	rcu_read_lock();
	1190
[7ef2249]	1191	/* Mark src_head immutable - signals updaters bucket join started. */
	1192	mark_const(psrc_head);
	1193	/* todo: cas order barrier*/
	1194
	1195	cht_link_t join_node = get_next(psrc_head);
	1196
	1197	if (join_node) {
	1198	mark_join_node(join_node);
	1199	/* todo: cas order barrier*/
	1200
	1201	link_to_join_node(h, pdest_head, join_node, split_hash);
	1202	/* todo: cas order barrier*/
	1203	}
	1204
	1205	cas_link(psrc_head, join_node, N_CONST, join_node, N_INVALID);
[3bb732b]	1206
	1207	rcu_read_unlock();
[7ef2249]	1208	}
	1209
	1210	static void link_to_join_node(cht_t h, marked_ptr_t pdest_head,
	1211	cht_link_t *join_node, size_t split_hash)
	1212	{
	1213	bool done;
	1214	do {
	1215	wnd_t wnd = {
	1216	.ppred = pdest_head,
	1217	.cur = get_next(*pdest_head)
	1218	};
	1219
	1220	bool dummy;
	1221
	1222	if (!find_wnd_and_gc(h, split_hash, WM_LEAVE_JOIN, &wnd, &dummy))
	1223	continue;
	1224
	1225	if (wnd.cur) {
	1226	/* Must be from the new appended bucket. */
	1227	ASSERT(split_hash <= node_hash(h, wnd.cur));
	1228	return;
	1229	}
	1230
	1231	/* Reached the tail of pdest_head - link it to the join node. */
	1232	marked_ptr_t ret = cas_link(wnd.ppred, 0, N_NORMAL, join_node, N_NORMAL);
	1233
	1234	done = (ret == make_link(0, N_NORMAL));
	1235	} while (!done);
	1236	}
	1237
	1238	static void free_later(cht_t h, cht_link_t item)
	1239	{
	1240	/*
	1241	* remove_callback only works as rcu_func_t because rcu_link is the first
	1242	* field in cht_link_t.
	1243	*/
	1244	rcu_call(&item->rcu_link, (rcu_func_t)h->op->remove_callback);
	1245
[3bb732b]	1246	item_removed(h);
[7ef2249]	1247	}
	1248
[3bb732b]	1249	static void item_removed(cht_t *h)
[7ef2249]	1250	{
[3bb732b]	1251	/* todo: impl */
[7ef2249]	1252	}
	1253
[3bb732b]	1254	static void item_inserted(cht_t *h)
[7ef2249]	1255	{
[3bb732b]	1256	/* todo: impl */
	1257	}
	1258
	1259	static void resize_table(void *arg)
	1260	{
	1261	cht_t h = (cht_t )arg;
[7ef2249]	1262
[3bb732b]	1263	ASSERT(h->b);
	1264	ASSERT(0 < (read_barrier(), atomic_get(&h->resize_reqs)));
	1265
	1266	/* Load the most current h->item_cnt. */
	1267	read_barrier();
	1268	do {
	1269	size_t cur_items = h->item_cnt;
	1270	size_t bucket_cnt = (1 << h->b->order);
	1271
	1272	if (cur_items >= CHT_MAX_LOAD * bucket_cnt) {
	1273	grow_table(h);
	1274	} else if (cur_items <= CHT_MAX_LOAD * bucket_cnt / 4) {
	1275	shrink_table(h);
	1276	}
	1277
	1278	/* Load the most current h->item_cnt and h->resize_reqs. */
	1279	read_barrier();
	1280	} while (0 < atomic_predec(&h->resize_reqs));
	1281	}
	1282
	1283	static void grow_table(cht_t *h)
	1284	{
	1285	if (h->b->order >= CHT_MAX_ORDER)
	1286	return;
[7ef2249]	1287
[3bb732b]	1288	h->new_b = alloc_buckets(h->b->order + 1, true);
	1289
	1290	/* Failed to alloc a new table - try next time the resizer is run. */
	1291	if (!h->new_b)
	1292	return;
	1293
	1294	/* Wait for all readers and updaters to see the initialized new table. */
	1295	rcu_synchronize();
[7ef2249]	1296
[3bb732b]	1297	size_t old_bucket_cnt = (1 << h->b->order);
[7ef2249]	1298
[3bb732b]	1299	/*
	1300	* Give updaters a chance to help out with the resize. Do the minimum
	1301	* work needed to announce a resize is in progress, ie start moving heads.
	1302	*/
	1303	for (size_t idx = 0; idx < old_bucket_cnt; ++idx) {
	1304	start_head_move(&h->b->head[idx]);
[7ef2249]	1305	}
	1306
[3bb732b]	1307	/* Complete moving heads and split any buckets not yet split by updaters. */
	1308	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
	1309	marked_ptr_t *move_dest_head = &h->new_b->head[grow_idx(old_idx)];
	1310	marked_ptr_t *move_src_head = &h->b->head[old_idx];
	1311
	1312	/* Head move not yet completed. */
	1313	if (N_INVALID != get_mark(*move_src_head)) {
	1314	complete_head_move(move_src_head, move_dest_head);
	1315	}
	1316
	1317	size_t split_idx = grow_to_split_idx(old_idx);
	1318	size_t split_hash = calc_split_hash(split_idx, h->new_b->order);
	1319	marked_ptr_t *split_dest_head = &h->new_b->head[split_idx];
	1320
	1321	split_bucket(h, move_dest_head, split_dest_head, split_hash);
	1322	}
	1323
	1324	/*
	1325	* Wait for all updaters to notice the new heads. Once everyone sees
	1326	* the invalid old bucket heads they will know a resize is in progress
	1327	* and updaters will modify the correct new buckets.
	1328	*/
	1329	rcu_synchronize();
	1330
	1331	/* Clear the JOIN_FOLLOWS mark and remove the link between the split buckets.*/
	1332	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
	1333	size_t new_idx = grow_idx(old_idx);
	1334
	1335	cleanup_join_follows(h, &h->new_b[new_idx]);
	1336	}
	1337
	1338	/*
	1339	* Wait for everyone to notice that buckets were split, ie link connecting
	1340	* the join follows and join node has been cut.
	1341	*/
	1342	rcu_synchronize();
	1343
	1344	/* Clear the JOIN mark and GC any deleted join nodes. */
	1345	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
	1346	size_t new_idx = grow_to_split_idx(old_idx);
	1347
	1348	cleanup_join_node(h, &h->new_b[new_idx]);
	1349	}
	1350
	1351	/* Wait for everyone to see that the table is clear of any resize marks. */
	1352	rcu_synchronize();
	1353
	1354	cht_buckets_t *old_b = h->b;
	1355	rcu_assign(h->b, h->new_b);
	1356
	1357	/* Wait for everyone to start using the new table. */
	1358	rcu_synchronize();
	1359
	1360	free(old_b);
	1361
	1362	/* Not needed; just for increased readability. */
	1363	h->new_b = 0;
	1364	}
	1365
	1366	static void shrink_table(cht_t *h)
	1367	{
	1368	if (h->b->order <= CHT_MIN_ORDER)
	1369	return;
	1370
	1371	h->new_b = alloc_buckets(h->b->order - 1, true);
	1372
	1373	/* Failed to alloc a new table - try next time the resizer is run. */
	1374	if (!h->new_b)
	1375	return;
	1376
	1377	/* Wait for all readers and updaters to see the initialized new table. */
	1378	rcu_synchronize();
	1379
	1380	size_t old_bucket_cnt = (1 << h->b->order);
	1381
	1382	/*
	1383	* Give updaters a chance to help out with the resize. Do the minimum
	1384	* work needed to announce a resize is in progress, ie start moving heads.
	1385	*/
	1386	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
	1387	size_t new_idx = shrink_idx(old_idx);
	1388
	1389	/* This bucket should be moved. */
	1390	if (grow_idx(new_idx) == old_idx) {
	1391	start_head_move(&h->b->head[old_idx]);
	1392	} else {
	1393	/* This bucket should join the moved bucket once the move is done.*/
	1394	}
	1395	}
	1396
	1397	/* Complete moving heads and join buckets with the moved buckets. */
	1398	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
	1399	size_t new_idx = shrink_idx(old_idx);
	1400
	1401	/* This bucket should be moved. */
	1402	if (grow_idx(new_idx) == old_idx) {
	1403	/* Head move not yet completed. */
	1404	if (N_INVALID != get_mark(h->b->head[old_idx])) {
	1405	complete_head_move(&h->b->head[old_idx], &h->new_b->head[new_idx]);
	1406	}
	1407	} else {
	1408	/* This bucket should join the moved bucket. */
	1409	size_t split_hash = calc_split_hash(old_idx, h->b->order);
	1410	join_buckets(h, &h->b->head[old_idx], &h->new_b->head[new_idx],
	1411	split_hash);
	1412	}
	1413	}
	1414
	1415	/*
	1416	* Wait for all updaters to notice the new heads. Once everyone sees
	1417	* the invalid old bucket heads they will know a resize is in progress
	1418	* and updaters will modify the correct new buckets.
	1419	*/
	1420	rcu_synchronize();
	1421
	1422	/* Let everyone know joins are complete and fully visible. */
	1423	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
	1424	size_t move_src_idx = grow_idx(shrink_idx(old_idx));
	1425
	1426	/* Set the invalid joinee head to NULL. */
	1427	if (old_idx != move_src_idx) {
	1428	ASSERT(N_INVALID == h->b->head[old_idx]);
	1429
	1430	if (0 != get_next(h->b->head[old_idx]))
	1431	h->b->head[old_idx] = make_link(0, N_INVALID);
	1432	}
	1433	}
	1434
	1435	/* todo comment join node vs reset joinee head*/
	1436	rcu_synchronize();
	1437
	1438	size_t new_bucket_cnt = (1 << h->new_b->order);
	1439
	1440	/* Clear the JOIN mark and GC any deleted join nodes. */
	1441	for (size_t new_idx = 0; new_idx < new_bucket_cnt; ++new_idx) {
	1442	cleanup_join_node(h, &h->new_b[new_idx]);
	1443	}
	1444
	1445	/* Wait for everyone to see that the table is clear of any resize marks. */
	1446	rcu_synchronize();
	1447
	1448	cht_buckets_t *old_b = h->b;
	1449	rcu_assign(h->b, h->new_b);
	1450
	1451	/* Wait for everyone to start using the new table. */
	1452	rcu_synchronize();
	1453
	1454	free(old_b);
	1455
	1456	/* Not needed; just for increased readability. */
	1457	h->new_b = 0;
	1458	}
	1459
	1460	static void cleanup_join_node(cht_t h, marked_ptr_t new_head)
	1461	{
	1462	rcu_read_lock();
	1463
	1464	cht_link_t cur = get_next(new_head);
	1465
	1466	while (cur) {
	1467	/* Clear the join node's JN mark - even if it is marked as deleted. */
	1468	if (N_JOIN & get_mark(cur->link)) {
	1469	clear_join_and_gc(h, cur, new_head);
	1470	break;
	1471	}
	1472
	1473	cur = get_next(cur->link);
	1474	}
	1475
	1476	rcu_read_unlock();
	1477	}
	1478
	1479	static void clear_join_and_gc(cht_t h, cht_link_t join_node,
	1480	marked_ptr_t *new_head)
	1481	{
	1482	ASSERT(join_node && (N_JOIN & get_mark(join_node->link)));
	1483
	1484	bool done;
	1485
	1486	/* Clear the JN mark. */
	1487	do {
	1488	marked_ptr_t jn_link = join_node->link;
	1489	cht_link_t *next = get_next(jn_link);
	1490	/* Clear the JOIN mark but keep the DEL mark if present. */
	1491	mark_t cleared_mark = get_mark(jn_link) & N_DELETED;
	1492
	1493	marked_ptr_t ret =
	1494	_cas_link(&join_node->link, jn_link, make_link(next, cleared_mark));
	1495
	1496	/* Done if the mark was cleared. Retry if a new node was inserted. */
	1497	done = (ret == jn_link);
	1498	} while (!done);
	1499
	1500	if (!(N_DELETED & get_mark(join_node->link)))
	1501	return;
	1502
	1503	/* The join node had been marked as deleted - GC it. */
	1504
	1505	size_t jn_hash = node_hash(h, join_node);
	1506	do {
	1507	bool resizing;
	1508
	1509	wnd_t wnd = {
	1510	.ppred = new_head,
	1511	.cur = get_next(*new_head)
	1512	};
	1513
	1514	done = find_wnd_and_gc_pred(h, jn_hash, WM_NORMAL, same_node_pred,
	1515	join_node, &wnd, &resizing);
	1516
	1517	ASSERT(!resizing);
	1518	} while (!done);
	1519	}
	1520
	1521	static void cleanup_join_follows(cht_t h, marked_ptr_t new_head)
	1522	{
	1523	ASSERT(new_head);
	1524
	1525	rcu_read_lock();
	1526
	1527	wnd_t wnd = {
	1528	.ppred = 0,
	1529	.cur = 0
	1530	};
	1531	marked_ptr_t *cur_link = new_head;
	1532
	1533	/*
	1534	* Find the non-deleted node with a JF mark and clear the JF mark.
	1535	* The JF node may be deleted and/or the mark moved to its neighbors
	1536	* at any time. Therefore, we GC deleted nodes until we find the JF
	1537	* node in order to remove stale/deleted JF nodes left behind eg by
	1538	* delayed threads that did not yet get a chance to unlink the deleted
	1539	* JF node and move its mark.
	1540	*
	1541	* Note that the head may be marked JF (but never DELETED).
	1542	*/
	1543	while (true) {
	1544	bool is_jf_node = N_JOIN_FOLLOWS & get_mark(*cur_link);
	1545
	1546	/* GC any deleted nodes on the way - even deleted JOIN_FOLLOWS. */
	1547	if (N_DELETED & get_mark(*cur_link)) {
	1548	ASSERT(cur_link != new_head);
	1549	ASSERT(wnd.ppred && wnd.cur);
	1550	ASSERT(cur_link == &wnd.cur->link);
	1551
	1552	bool dummy;
	1553	bool deleted = gc_deleted_node(h, WM_MOVE_JOIN_FOLLOWS, &wnd, &dummy);
	1554
	1555	/* Failed to GC or collected a deleted JOIN_FOLLOWS. */
	1556	if (!deleted \|\| is_jf_node) {
	1557	/* Retry from the head of the bucket. */
	1558	cur_link = new_head;
	1559	continue;
	1560	}
	1561	} else {
	1562	/* Found a non-deleted JF. Clear its JF mark. */
	1563	if (is_jf_node) {
	1564	cht_link_t next = get_next(cur_link);
	1565	marked_ptr_t ret
	1566	= cas_link(cur_link, next, N_JOIN_FOLLOWS, 0, N_NORMAL);
	1567
	1568	/* Successfully cleared the JF mark of a non-deleted node. */
	1569	if (ret == make_link(next, N_JOIN_FOLLOWS)) {
	1570	break;
	1571	} else {
	1572	/*
	1573	* The JF node had been deleted or a new node inserted
	1574	* right after it. Retry from the head.
	1575	*/
	1576	cur_link = new_head;
	1577	continue;
	1578	}
	1579	} else {
	1580	wnd.ppred = cur_link;
	1581	wnd.cur = get_next(*cur_link);
	1582	}
	1583	}
	1584
	1585	/* We must encounter a JF node before we reach the end of the bucket. */
	1586	ASSERT(wnd.cur);
	1587	cur_link = &wnd.cur->link;
	1588	}
	1589
	1590	rcu_read_unlock();
	1591	}
	1592
	1593
	1594	static size_t calc_split_hash(size_t split_idx, size_t order)
	1595	{
	1596	ASSERT(1 <= order && order <= 8 * sizeof(size_t));
	1597	return split_idx << (8 * sizeof(size_t) - order);
	1598	}
	1599
	1600	static size_t calc_bucket_idx(size_t hash, size_t order)
	1601	{
	1602	ASSERT(1 <= order && order <= 8 * sizeof(size_t));
	1603	return hash >> (8 * sizeof(size_t) - order);
	1604	}
	1605
	1606	static size_t grow_to_split_idx(size_t old_idx)
	1607	{
	1608	return grow_idx(old_idx) \| 1;
	1609	}
	1610
	1611	static size_t grow_idx(size_t idx)
	1612	{
	1613	return idx << 1;
	1614	}
	1615
	1616	static size_t shrink_idx(size_t idx)
	1617	{
	1618	return idx >> 1;
[7ef2249]	1619	}
	1620
	1621
	1622	static size_t key_hash(cht_t h, void key)
	1623	{
	1624	return hash_mix(h->op->key_hash(key));
	1625	}
	1626
	1627	static size_t node_hash(cht_t h, const cht_link_t item)
	1628	{
	1629	return hash_mix(h->op->hash(item));
	1630	}
	1631
	1632
[3bb732b]	1633	static marked_ptr_t make_link(const cht_link_t *next, mark_t mark)
[7ef2249]	1634	{
	1635	marked_ptr_t ptr = (marked_ptr_t) next;
	1636
	1637	ASSERT(!(ptr & N_MARK_MASK));
	1638	ASSERT(!((unsigned)mark & ~N_MARK_MASK));
	1639
	1640	return ptr \| mark;
	1641	}
	1642
	1643
	1644	static cht_link_t * get_next(marked_ptr_t link)
	1645	{
	1646	return (cht_link_t*)(link & ~N_MARK_MASK);
	1647	}
	1648
	1649
	1650	static mark_t get_mark(marked_ptr_t link)
	1651	{
	1652	return (mark_t)(link & N_MARK_MASK);
	1653	}
	1654
	1655
	1656	static void next_wnd(wnd_t *wnd)
	1657	{
	1658	ASSERT(wnd);
	1659	ASSERT(wnd->cur);
	1660
	1661	wnd->last = wnd->cur;
	1662	wnd->ppred = &wnd->cur->link;
	1663	wnd->cur = get_next(wnd->cur->link);
	1664	}
	1665
	1666
	1667	static bool same_node_pred(void node, const cht_link_t item2)
	1668	{
	1669	const cht_link_t item1 = (const cht_link_t) node;
	1670	return item1 == item2;
	1671	}
	1672
[3bb732b]	1673	static marked_ptr_t cas_link(marked_ptr_t link, const cht_link_t cur_next,
	1674	mark_t cur_mark, const cht_link_t *new_next, mark_t new_mark)
[7ef2249]	1675	{
	1676	return _cas_link(link, make_link(cur_next, cur_mark),
	1677	make_link(new_next, new_mark));
	1678	}
	1679
	1680	static marked_ptr_t _cas_link(marked_ptr_t *link, marked_ptr_t cur,
	1681	marked_ptr_t new)
	1682	{
[3bb732b]	1683	/*
	1684	* cas(x) on the same location x on one cpu must be ordered, but do not
	1685	* have to be ordered wrt to other cas(y) to a different location y
	1686	* on the same cpu.
	1687	*
	1688	* cas(x) must act as a write barrier on x, ie if cas(x) succeeds
	1689	* and is observed by another cpu, then all cpus must be able to
	1690	* make the effects of cas(x) visible just by issuing a load barrier.
	1691	* For example:
	1692	* cpu1 cpu2 cpu3
	1693	* cas(x, 0 -> 1), succeeds
	1694	* cas(x, 0 -> 1), fails
	1695	* MB
	1696	* y = 7
	1697	* sees y == 7
	1698	* loadMB must be enough to make cas(x) on cpu3 visible to cpu1, ie x == 1.
	1699	*
	1700	* If cas() did not work this way:
	1701	* - our head move protocol would not be correct.
	1702	* - freeing an item linked to a moved head after another item was
	1703	* inserted in front of it, would require more than one grace period.
	1704	*/
[7ef2249]	1705	}
	1706
	1707	/** @}
	1708	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: