Context Navigation

source: mainline/kernel/generic/src/adt/cht.c@ 3bb732b

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since 3bb732b was 3bb732b, checked in by Adam Hraska <adam.hraska+hos@…>, 13 years ago
cht: Implemented insert, resize. Heavy work in progress. Excluded from build.
Property mode set to `100644`
File size: 47.1 KB

Line
1	/*
2	* Copyright (c) 2012 Adam Hraska
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @addtogroup genericadt
30	* @{
31	*/
32
33	/**
34	* @file
35	* @brief Concurrent resizable lock-free hash table.
36	*
37	*/
38
39	#include <adt/cht.h>
40	#include <debug.h>
41	#include <memstr.h>
42	#include <mm/slab.h>
43	#include <barrier.h>
44	#include <compiler/barrier.h>
45	#include <atomic.h>
46	#include <synch/rcu.h>
47
/** Base-2 logarithm of the smallest allowed bucket count. */
#define CHT_MIN_ORDER  6
/** Base-2 logarithm of the largest allowed bucket count. */
#define CHT_MAX_ORDER  (8 * sizeof(size_t))
/** Smallest number of buckets a table may have, ie 2^CHT_MIN_ORDER. */
#define CHT_MIN_BUCKET_CNT  (1 << CHT_MIN_ORDER)
/** Maximum load of the table. The value must be a power of 2. */
#define CHT_MAX_LOAD  2
56
57	typedef cht_ptr_t marked_ptr_t;
58	typedef bool (equal_pred_t)(void arg, const cht_link_t *item);
59
/**
 * Mark bits carried in the low bits of a marked_ptr_t.
 *
 * Several names intentionally share a value: which name applies
 * depends on whether the link belongs to a node or to a bucket head.
 */
typedef enum mark {
	/** Clean link. */
	N_NORMAL       = 0,
	/** The node owning the link is logically deleted. */
	N_DELETED      = 1,
	/** The bucket head is invalid. */
	N_INVALID      = 1,
	/** The node is a join node. */
	N_JOIN         = 2,
	/** The successor of this link is a join node. */
	N_JOIN_FOLLOWS = 2,
	/** The bucket head is immutable. */
	N_CONST        = 3,
	/** Mask covering all mark bits. */
	N_MARK_MASK    = 3
} mark_t;
69
/** Selects how bucket walks treat JOIN and JOIN_FOLLOWS marks. */
typedef enum walk_mode {
	/** No resize in progress; only clean/normal links are expected. */
	WM_NORMAL            = 4,
	/** Join nodes are skipped and never unlinked. */
	WM_LEAVE_JOIN        = 5,
	/** A JOIN_FOLLOWS mark is carried over when a link is replaced. */
	WM_MOVE_JOIN_FOLLOWS = 6
} walk_mode_t;
75
76	typedef struct wnd {
77	marked_ptr_t *ppred;
78	cht_link_t *cur;
79	cht_link_t *last;
80	} wnd_t;
81
82
83	static size_t size_to_order(size_t bucket_cnt);
84	static cht_buckets_t *alloc_buckets(size_t order);
85
86	static marked_ptr_t make_link(cht_link_t *next, mark_t mark);
87	static cht_link_t * get_next(marked_ptr_t link);
88	static mark_t get_mark(marked_ptr_t link);
89
90	static size_t key_hash(cht_t h, void key);
91	static size_t node_hash(cht_t h, const cht_link_t item);
92
93	static size_t calc_split_hash(size_t split_idx, size_t order);
94	static size_t calc_bucket_idx(size_t hash, size_t order);
95	static size_t grow_idx(size_t idx);
96	static size_t shrink_idx(size_t idx);
97
98
99
100	bool cht_create(cht_t h, size_t init_size, cht_ops_t op)
101	{
102	ASSERT(h);
103	ASSERT(op && op->hash && op->key_hash && op->equal && op->key_equal);
104
105	/* All operations are compulsory. */
106	if (!op \|\| !op->hash \|\| !op->key_hash \|\| !op->equal \|\| !op->key_equal)
107	return false;
108
109	size_t order = size_to_order(init_size);
110
111	h->b = alloc_buckets(order);
112
113	if (!h->b)
114	return false;
115
116	h->new_b = 0;
117	h->op = op;
118	atomic_set(&h->item_cnt, 0);
119	atomic_set(&h->resize_reqs, 0);
120	/* Ensure the initialization takes place before we start using the table. */
121	write_barrier();
122
123	return true;
124	}
125
126	static cht_buckets_t *alloc_buckets(size_t order, bool set_invalid)
127	{
128	size_t bucket_cnt = (1 << order);
129	cht_buckets_t *b = malloc(
130	sizeof(cht_buckets_t) + (bucket_cnt - 1) * sizeof(marked_ptr_t));
131
132	if (!b)
133	return 0;
134
135	b->order = order;
136
137	marked_ptr_t head_link
138	= set_invalid ? make_link(0, N_INVALID) : make_link(0, N_NORMAL);
139
140	for (size_t i = 0; i < bucket_cnt; ++i) {
141	b->head[i] = head_link;
142	}
143
144	return b;
145	}
146
147	static size_t size_to_order(size_t bucket_cnt)
148	{
149	size_t order = CHT_MIN_ORDER;
150
151	/* Find a power of two such that bucket_cnt <= 2^order */
152	do {
153	if (bucket_cnt <= (1 << order))
154	return order;
155
156	++order;
157	} while (order < CHT_MAX_ORDER);
158
159	return order;
160	}
161
162
163	void cht_destroy(cht_t *h)
164	{
165	/* todo: impl */
166	}
167
168	cht_link_t cht_find(cht_t h, void *key)
169	{
170	/* Make the most recent changes of the table visible. */
171	read_barrier();
172	return cht_find_lazy(h, key);
173	}
174
175
176	cht_link_t cht_find_lazy(cht_t h, void *key)
177	{
178	ASSERT(h);
179	ASSERT(rcu_read_locked());
180
181	size_t hash = key_hash(h, key);
182
183	cht_buckets_t *b = rcu_access(h->b);
184	size_t idx = calc_bucket_idx(hash, b->order);
185	/*
186	* No need for access_once. b->head[idx] will point to an allocated node
187	* even if marked invalid until we exit rcu read section.
188	*/
189	marked_ptr_t head = b->head[idx];
190
191	if (N_INVALID == get_mark(head))
192	return find_resizing(h, key, hash, head, idx);
193
194	return search_bucket(h, head, key, hash);
195	}
196
197
198	static cht_link_t search_bucket(cht_t h, marked_ptr_t head, void *key,
199	size_t search_hash)
200	{
201	cht_link_t *cur = get_next(head);
202
203	while (cur) {
204	/*
205	* It is safe to access nodes even outside of this bucket (eg when
206	* splitting the bucket). The resizer makes sure that any node we
207	* may find by following the next pointers is allocated.
208	*/
209	size_t cur_hash = node_hash(cur);
210
211	if (cur_hash >= search_hash) {
212	if (cur_hash != search_hash)
213	return 0;
214
215	int present = !(N_DELETED & get_mark(cur->link));
216	if (present && h->op->key_equal(key, cur))
217	return cur;
218	}
219
220	cur = get_next(cur->link);
221	}
222
223	return 0;
224	}
225
226	static cht_link_t find_resizing(cht_t h, void *key, size_t hash,
227	marked_ptr_t old_head, size_t old_idx)
228	{
229	ASSERT(N_INVALID == get_mark(old_head));
230	ASSERT(h->new_b);
231
232	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
233	marked_ptr_t new_head = h->new_b->head[new_idx];
234	marked_ptr_t search_head = new_head;
235
236	/* Growing. */
237	if (h->b->order < h->new_b->order) {
238	/*
239	* Old bucket head is invalid, so it must have been already
240	* moved. Make the new head visible if still not visible, ie
241	* invalid.
242	*/
243	if (N_INVALID == get_mark(new_head)) {
244	/*
245	* We should be searching a newly added bucket but the old
246	* moved bucket has not yet been split (its marked invalid)
247	* or we have not yet seen the split.
248	*/
249	if (grow_idx(old_idx) != new_idx) {
250	/*
251	* Search the moved bucket. It is guaranteed to contain
252	* items of the newly added bucket that were present
253	* before the moved bucket was split.
254	*/
255	new_head = h->new_b->head[grow_idx(old_idx)];
256	}
257
258	/* new_head is now the moved bucket, either valid or invalid. */
259
260	/*
261	* The old bucket was definitely moved to new_head but the
262	* change of new_head had not yet propagated to this cpu.
263	*/
264	if (N_INVALID == get_mark(new_head)) {
265	/*
266	* We could issue a read_barrier() and make the now valid
267	* moved bucket head new_head visible, but instead fall back
268	* on using the old bucket. Although the old bucket head is
269	* invalid, it points to a node that is allocated and in the
270	* right bucket. Before the node can be freed, it must be
271	* unlinked from the head (or another item after that item
272	* modified the new_head) and a grace period must elapse.
273	* As a result had the node been already freed the grace
274	* period preceeding the free() would make the unlink and
275	* any changes to new_head visible. Therefore, it is safe
276	* to use the node pointed to from the old bucket head.
277	*/
278
279	search_head = old_head;
280	} else {
281	search_head = new_head;
282	}
283	}
284
285	return search_bucket(h, search_head, key, hash);
286	} else if (h->b->order > h->new_b->order) {
287	/* Shrinking. */
288
289	/* Index of the bucket in the old table that was moved. */
290	size_t move_src_idx = grow_idx(new_idx);
291	marked_ptr_t moved_old_head = h->b->head[move_src_idx];
292
293	/*
294	* h->b->head[move_src_idx] had already been moved to new_head
295	* but the change to new_head had not yet propagated to us.
296	*/
297	if (N_INVALID == get_mark(new_head)) {
298	/*
299	* new_head is definitely valid and we could make it visible
300	* to this cpu with a read_barrier(). Instead, use the bucket
301	* in the old table that was moved even though it is now marked
302	* as invalid. The node it points to must be allocated because
303	* a grace period would have to elapse before it could be freed;
304	* and the grace period would make the now valid new_head
305	* visible to all cpus.
306	*
307	* Note that move_src_idx may not be the same as old_idx.
308	* If move_src_idx != old_idx then old_idx is the bucket
309	* in the old table that is not moved but instead it is
310	* appended to the moved bucket, ie it is added at the tail
311	* of new_head. In that case an invalid old_head notes that
312	* it had already been merged into (the moved) new_head.
313	* We will try to search that bucket first because it
314	* may contain some newly added nodes after the bucket
315	* join. Moreover, the bucket joining link may already be
316	* visible even if new_head is not. Therefore, if we're
317	* lucky we'll find the item via moved_old_head. In any
318	* case, we'll retry in proper old_head if not found.
319	*/
320	search_head = moved_old_head;
321	}
322
323	cht_link_t *ret = search_bucket(h, search_head, key, hash);
324
325	if (ret)
326	return ret;
327	/*
328	* Bucket old_head was already joined with moved_old_head
329	* in the new table but we have not yet seen change of the
330	* joining link (or the item is not in the table).
331	*/
332	if (move_src_idx != old_idx && get_next(old_head)) {
333	/*
334	* Note that old_head (the bucket to be merged into new_head)
335	* points to an allocated join node (if non-null) even if marked
336	* invalid. Before the resizer lets join nodes to be unlinked
337	* (and freed) it sets old_head to 0 and waits for a grace period.
338	* So either the invalid old_head points to join node; or old_head
339	* is null and we would have seen a completed bucket join while
340	* traversing search_head.
341	*/
342	ASSERT(N_JOIN & get_mark(get_next(old_head)->link));
343	return search_bucket(h, old_head, key, hash);
344	}
345
346	return 0;
347	} else {
348	/*
349	* Resize is almost done. The resizer is waiting to make
350	* sure all cpus see that the new table replaced the old one.
351	*/
352	ASSERT(h->b->order == h->new_b->order);
353	/*
354	* The resizer must ensure all new bucket heads are visible before
355	* replacing the old table.
356	*/
357	ASSERT(N_NORMAL == get_mark(new_head));
358	return search_bucket(h, new_head, key, hash);
359	}
360	}
361
362
363	void cht_insert(cht_t h, cht_link_t item)
364	{
365	return insert_impl(h, item, true);
366	}
367
368	bool cht_insert_unique(cht_t h, cht_link_t item)
369	{
370	insert_impl(h, item, false);
371	}
372
373	bool insert_impl(cht_t h, cht_link_t item, bool unique)
374	{
375	rcu_read_lock();
376
377	cht_buckets_t *b = rcu_access(h->b);
378	size_t hash = node_hash(h, item);
379	size_t idx = calc_bucket_idx(hash, b->order);
380	marked_ptr_t *phead = &b->head[idx];
381
382	bool resizing = false;
383	bool inserted;
384
385	do {
386	walk_mode_t walk_mode = WM_NORMAL;
387	bool join_finishing;
388
389	resizing = resizing \|\| (N_NORMAL != get_mark(*phead));
390
391	/* The table is resizing. Get the correct bucket head. */
392	if (resizing) {
393	upd_resizing_head(hash, &phead, &join_finishing, &walk_mode);
394	}
395
396	wnd_t wnd = {
397	.ppred = phead,
398	.cur = get_next(*phead),
399	.last = 0
400	};
401
402	if (!find_wnd_and_gc(h, hash, walk_mode, &wnd, &resizing)) {
403	/* Could not GC a node; or detected an unexpected resize. */
404	continue;
405	}
406
407	if (unique && has_duplicates(h, item, hash, wnd)) {
408	rcu_read_unlock();
409	return false;
410	}
411
412	inserted = insert_at(item, wnd, walk_mode, &resizing);
413	} while (!inserted);
414
415	item_inserted(h);
416
417	rcu_read_unlock();
418	return true;
419	}
420
421	static bool insert_at(cht_link_t item, const wnd_t wnd, walk_mode_t walk_mode,
422	bool *resizing)
423	{
424	marked_ptr_t ret;
425
426	if (walk_mode == WM_NORMAL) {
427	item->link = make_link(wnd->cur, N_NORMAL);
428	/* Initialize the item before adding it to a bucket. */
429	memory_barrier();
430
431	/* Link a clean/normal predecessor to the item. */
432	ret = cas_link(wnd->ppred, wnd->cur, N_NORMAL, item, N_NORMAL);
433
434	if (ret == make_link(wnd->cur, N_NORMAL)) {
435	return true;
436	} else {
437	*resizing = ((N_JOIN_FOLLOWS \| N_JOIN) & get_mark(ret));
438	return false;
439	}
440	} else if (walk_mode == WM_MOVE_JOIN_FOLLOWS) {
441	/* Move JOIN_FOLLOWS mark but filter out the DELETED mark. */
442	mark_t jf_mark = get_mark(*wnd->ppred) & N_JOIN_FOLLOWS;
443	item->link = make_link(wnd->cur, jf_mark);
444	/* Initialize the item before adding it to a bucket. */
445	memory_barrier();
446
447	/* Link the not-deleted predecessor to the item. Move its JF mark. */
448	ret = cas_link(wnd->ppred, wnd->cur, jf_mark, item, N_NORMAL);
449
450	return ret == make_link(wnd->cur, jf_mark);
451	} else {
452	ASSERT(walk_mode == WM_LEAVE_JOIN);
453
454	item->link = make_link(wnd->cur, N_NORMAL);
455	/* Initialize the item before adding it to a bucket. */
456	memory_barrier();
457
458	mark_t pred_mark = get_mark(*wnd->ppred);
459	/* If the predecessor is a join node it may be marked deleted.*/
460	mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
461
462	ret = cas_link(wnd->ppred, wnd->cur, exp_pred_mark, item, exp_pred_mark);
463	return ret == make_link(wnd->cur, exp_pred_mark);
464	}
465	}
466
467	static bool has_duplicates(cht_t h, cht_link_t item, size_t hash,
468	const wnd_t *cwnd)
469	{
470	ASSERT(0 == wnd->cur \|\| hash <= node_hash(h, wnd->cur));
471
472	if (0 == wnd->cur \|\| hash < node_hash(h, wnd->cur))
473	return false;
474
475	/*
476	* Load the most recent node marks. Otherwise we might pronounce a
477	* logically deleted node for a duplicate of the item just because
478	* the deleted node's DEL mark had not yet propagated to this cpu.
479	*/
480	read_barrier();
481
482	cht_link_t *cur = wnd->cur;
483
484	do {
485	bool deleted = (N_DELETED & get_mark(cur->link));
486
487	/* Skip logically deleted nodes. */
488	if (!deleted && h->op->equal(item, cur))
489	return true;
490
491	cur = get_next(cur->link);
492	} while (cur && node_hash(h, cur) == hash);
493
494	return false;
495	}
496
497
498	size_t cht_remove_key(cht_t h, void key)
499	{
500	ASSERT(h);
501
502	size_t hash = key_hash(h, key);
503	size_t removed = 0;
504
505	while (remove_pred(h, hash, h->op->key_equal, key))
506	++removed;
507
508	return removed;
509	}
510
511	bool cht_remove_item(cht_t h, cht_link_t item)
512	{
513	ASSERT(h);
514	ASSERT(item);
515
516	/*
517	* Even though we know the node we want to delete we must unlink it
518	* from the correct bucket and from a clean/normal predecessor. Therefore,
519	* we search for it again from the beginning of the correct bucket.
520	*/
521	size_t hash = node_hash(h, item);
522	return remove_pred(h, hash, same_node_pred, item);
523	}
524
525
526	static bool remove_pred(cht_t h, size_t hash, equal_pred_t pred, void pred_arg)
527	{
528	rcu_read_lock();
529
530	bool resizing = false;
531	bool deleted = false;
532	bool deleted_but_gc = false;
533
534	cht_buckets_t *b = rcu_access(h->b);
535	size_t idx = calc_bucket_idx(hash, b->order);
536	marked_ptr_t *phead = &b->head[idx];
537
538	do {
539	walk_mode_t walk_mode = WM_NORMAL;
540	bool join_finishing = false;
541
542	resizing = resizing \|\| (N_NORMAL != get_mark(*phead));
543
544	/* The table is resizing. Get the correct bucket head. */
545	if (resizing) {
546	upd_resizing_head(hash, &phead, &join_finishing, &walk_mode);
547	}
548
549	wnd_t wnd = {
550	.ppred = phead,
551	.cur = get_next(*phead),
552	.last = 0
553	};
554
555	if (!find_wnd_and_gc_pred(
556	h, hash, walk_mode, pred, pred_arg, &wnd, &resizing)) {
557	/* Could not GC a node; or detected an unexpected resize. */
558	continue;
559	}
560
561	/*
562	* The item lookup is affected by a bucket join but effects of
563	* the bucket join have not been seen while searching for the item.
564	*/
565	if (join_finishing && !join_completed(h, &wnd)) {
566	/*
567	* Bucket was appended at the end of another but the next
568	* ptr linking them together was not visible on this cpu.
569	* join_completed() makes this appended bucket visible.
570	*/
571	continue;
572	}
573
574	/* Already deleted, but delete_at() requested one GC pass. */
575	if (deleted_but_gc)
576	break;
577
578	bool found = wnd.cur && pred(pred_arg, wnd.cur);
579
580	if (!found) {
581	rcu_read_unlock();
582	return false;
583	}
584
585	deleted = delete_at(wnd, walk_mode, &deleted_but_gc, &resizing);
586	} while (!deleted \|\| deleted_but_gc);
587
588	rcu_read_unlock();
589	return true;
590	}
591
592
593	static bool delete_at(cht_t h, wnd_t wnd, walk_mode_t walk_mode,
594	bool deleted_but_gc, bool resizing)
595	{
596	ASSERT(wnd->cur);
597
598	*deleted_but_gc = false;
599
600	if (!mark_deleted(wnd->cur, walk_mode, resizing)) {
601	/* Already deleted, or unexpectedly marked as JOIN/JOIN_FOLLOWS. */
602	return false;
603	}
604
605	/* Marked deleted. Unlink from the bucket. */
606
607	/* Never unlink join nodes. */
608	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link)))
609	return true;
610
611	if (unlink_from_pred(wnd, walk_mode, resizing)) {
612	free_later(h, wnd->cur);
613	} else {
614	*deleted_but_gc = true;
615	}
616
617	return true;
618	}
619
620	static bool mark_deleted(cht_link_t cur, walk_mode_t walk_mode, bool resizing)
621	{
622	ASSERT(cur);
623
624	/*
625	* Btw, we could loop here if the cas fails but let's not complicate
626	* things and let's retry from the head of the bucket.
627	*/
628
629	cht_link_t *next = get_next(cur->link);
630
631	if (walk_mode == WM_NORMAL) {
632	/* Only mark clean/normal nodes - JF/JN is used only during resize. */
633	marked_ptr_t normal_link = make_link(next, N_NORMAL);
634	marked_ptr_t del_link = make_link(next, N_DELETED);
635
636	marked_ptr_t ret = cas_link(&cur->link, normal_link, del_link);
637
638	if (normal_link != ret) {
639	*resizing = (N_JOIN \| N_JOIN_FOLLOWS \| N_INVALID) & get_mark(ret);
640	return false;
641	}
642	} else {
643	ASSERT(N_JOIN == N_JOIN_FOLLOWS);
644
645	/* Keep the N_JOIN/N_JOIN_FOLLOWS mark but strip N_DELETED. */
646	mark_t cur_mark = get_mark(cur->link) & N_JOIN_FOLLOWS;
647
648	marked_ptr_t nondel_link = make_link(next, cur_mark);
649	marked_ptr_t del_link = make_link(next, cur_mark \| N_DELETED);
650
651	if (nondel_link != cas_link(&cur->link, nondel_link, del_link))
652	return false;
653	}
654
655	return true;
656	}
657
658	static bool unlink_from_pred(wnd_t wnd, walk_mode_t walk_mode, bool resizing)
659	{
660	ASSERT(wnd->cur && (N_DELETED & get_mark(wnd->cur->link)));
661
662	cht_link_t *next = get_next(wnd->cur->link);
663
664	if (walk_mode == WM_LEAVE_JOIN) {
665	/* Never try to unlink join nodes. */
666	ASSERT(!(N_JOIN & get_mark(wnd->cur->link)));
667
668	mark_t pred_mark = get_mark(*wnd->ppred);
669	/* Succeed only of the predecessor is clean/normal or a join node. */
670	mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
671
672	marked_ptr_t pred_link = make_link(wnd->cur, exp_pred_mark);
673	marked_ptr_t next_link = make_link(next, exp_pred_mark);
674
675	if (pred_link != cas_link(wnd->ppred, pred_link, next_link))
676	return false;
677	} else {
678	ASSERT(walk_mode == WM_MOVE_JOIN_FOLLOWS \|\| walk_mode == WM_NORMAL);
679	/* Move the JF mark if set. Clear DEL mark. */
680	mark_t cur_mark = N_JOIN_FOLLOWS & get_mark(wnd->cur->link);
681
682	/* The predecessor must be clean/normal. */
683	marked_ptr_t pred_link = make_link(wnd->cur, N_NORMAL);
684	/* Link to cur's successor keeping/copying cur's JF mark. */
685	marked_ptr_t next_link = make_link(next, cur_mark);
686
687	marked_ptr_t ret = cas_link(wnd->ppred, pred_link, next_link);
688
689	if (pred_link != ret) {
690	/* If we're not resizing the table there are no JF/JN nodes. */
691	*resizing = (walk_mode == WM_NORMAL)
692	&& (N_JOIN_FOLLOWS & get_mark(ret));
693	return false;
694	}
695	}
696
697	return true;
698	}
699
700
701	static bool find_wnd_and_gc_pred(cht_t *h, size_t hash, walk_mode_t walk_mode,
702	equal_pred_t pred, void pred_arg, wnd_t wnd, bool *resizing)
703	{
704	if (!wnd->cur)
705	return true;
706
707	/*
708	* A read barrier is not needed here to bring up the most recent
709	* node marks (esp the N_DELETED). At worst we'll try to delete
710	* an already deleted node; fail in delete_at(); and retry.
711	*/
712
713	size_t cur_hash = node_hash(h, wnd->cur);
714
715	while (cur_hash <= hash) {
716	/* GC any deleted nodes on the way. */
717	if (N_DELETED & get_mark(wnd->cur->link)) {
718	if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
719	/* Retry from the head of a bucket. */
720	return false;
721	}
722	} else {
723	/* Is this the node we were looking for? */
724	if (cur_hash == hash && pred(pred_arg, wnd->cur))
725	return true;
726
727	next_wnd(wnd);
728	}
729
730	/* The searched for node is not in the current bucket. */
731	if (!wnd->cur)
732	return true;
733
734	cur_hash = node_hash(h, wnd->cur);
735	}
736
737	/* The searched for node is not in the current bucket. */
738	return true;
739	}
740
741	/* todo: comment different semantics (eg deleted JN first w/ specific hash) */
742	static bool find_wnd_and_gc(cht_t *h, size_t hash, walk_mode_t walk_mode,
743	wnd_t wnd, bool resizing)
744	{
745	while (wnd->cur && node_hash(h, wnd->cur) < hash) {
746	/* GC any deleted nodes along the way to our desired node. */
747	if (N_DELETED & get_mark(wnd->cur->link)) {
748	if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
749	/* Failed to remove the garbage node. Retry. */
750	return false;
751	}
752	} else {
753	next_wnd(wnd);
754	}
755	}
756
757	/* wnd->cur may be 0 or even marked N_DELETED. */
758	return true;
759	}
760
761	static bool gc_deleted_node(cht_t h, walk_mode_t walk_mode, wnd_t wnd,
762	bool *resizing)
763	{
764	ASSERT(N_DELETED & get_mark(wnd->cur->link));
765
766	/* Skip deleted JOIN nodes. */
767	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link))) {
768	next_wnd(wnd);
769	} else {
770	/* Ordinary deleted node or a deleted JOIN_FOLLOWS. */
771	ASSERT(walk_mode != WM_LEAVE_JOIN
772	\|\| !((N_JOIN \| N_JOIN_FOLLOWS) & get_mark(wnd->cur->link)));
773
774	/* Unlink an ordinary deleted node, move JOIN_FOLLOWS mark. */
775	if (!unlink_from_pred(wnd, walk_mode, resizing)) {
776	/* Retry. The predecessor was deleted, invalid, const, join_follows. */
777	return false;
778	}
779
780	free_later(h, wnd->cur);
781
782	/* Leave ppred as is. */
783	wnd->last = wnd->cur;
784	wnd->cur = get_next(wnd->cur->link);
785	}
786
787	return true;
788	}
789
790	static bool join_completed(cht_t h, const wnd_t wnd)
791	{
792	/*
793	* The table is shrinking and the searched for item is in a bucket
794	* appended to another. Check that the link joining these two buckets
795	* is visible and if not, make it visible to this cpu.
796	*/
797
798	/*
799	* Resizer ensures h->b->order stays the same for the duration of this
800	* func. We got here because there was an alternative head to search.
801	* The resizer waits for all preexisting readers to finish after
802	* it
803	*/
804	ASSERT(h->b->order > h->new_b->order);
805
806	/* Either we did not need the joining link or we have already followed it.*/
807	if (wnd->cur)
808	return true;
809
810	/* We have reached the end of a bucket. */
811
812	if (wnd->last) {
813	size_t last_seen_hash = node_hash(h, wnd->last);
814	size_t last_old_idx = calc_bucket_idx(last_seen_hash, h->b->order);
815	size_t move_src_idx = grow_idx(shrink_idx(last_old_idx));
816
817	/*
818	* Last was in the joining bucket - if the searched for node is there
819	* we will find it.
820	*/
821	if (move_src_idx != last_old_idx)
822	return true;
823	}
824
825	/*
826	* Reached the end of the bucket but no nodes from the joining bucket
827	* were seen. There should have at least been a JOIN node so we have
828	* definitely not seen (and followed) the joining link. Make the link
829	* visible and retry.
830	*/
831	read_barrier();
832	return false;
833	}
834
835	static void upd_resizing_head(cht_t h, size_t hash, marked_ptr_t *phead,
836	bool join_finishing, bool walk_mode)
837	{
838	cht_buckets_t *b = rcu_access(h->b);
839	size_t old_idx = calc_bucket_idx(hash, b->order);
840	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
841
842	marked_ptr_t *pold_head = &b->head[old_idx];
843	marked_ptr_t *pnew_head = &h->new_b->head[new_idx];
844
845	/* In any case, use the bucket in the new table. */
846	*phead = pnew_head;
847
848	/* Growing the table. */
849	if (b->order < h->new_b->order) {
850	size_t move_dest_idx = grow_idx(old_idx);
851	marked_ptr_t *pmoved_head = &h->new_b->head[move_dest_idx];
852
853	/* Complete moving the bucket from the old to the new table. */
854	move_head(pold_head, pmoved_head);
855
856	/* The hash belongs to the moved bucket. */
857	if (move_dest_idx == new_idx) {
858	/*
859	* move_head() makes the new head of the moved bucket visible.
860	* The new head may be marked with a JOIN_FOLLOWS
861	*/
862	ASSERT(!(N_CONST & get_mark(*pnew_head)));
863	*walk_mode = WM_MOVE_JOIN_FOLLOWS;
864	} else {
865	/*
866	* The hash belongs to the bucket that is the result of splitting
867	* the old/moved bucket, ie the bucket that contains the second
868	* half of the split/old/moved bucket.
869	*/
870
871	/* The moved bucket has not yet been split. */
872	if (N_NORMAL != get_mark(*pnew_head)) {
873	size_t split_hash = calc_split_hash(new_idx, h->new_b->order);
874	split_bucket(pmoved_head, pnew_head, split_hash);
875	/*
876	* split_bucket() makes the new head visible. No
877	* JOIN_FOLLOWS in this part of split bucket.
878	*/
879	ASSERT(N_NORMAL == get_mark(*pnew_head));
880	}
881
882	*walk_mode = WM_LEAVE_JOIN;
883	}
884	} else if (h->new_b->order < b->order ) {
885	/* Shrinking the table. */
886
887	size_t move_src_idx = grow_idx(new_idx);
888
889	/*
890	* Complete moving the bucket from the old to the new table.
891	* Makes a valid pnew_head visible if already moved.
892	*/
893	move_head(&b->head[move_src_idx], pnew_head);
894
895	/* Hash belongs to the bucket to be joined with the moved bucket. */
896	if (move_src_idx != old_idx) {
897	/* Bucket join not yet completed. */
898	if (N_INVALID != get_mark(*pold_head)) {
899	size_t split_hash = calc_split_hash(old_idx, b->order);
900	join_buckets(pold_head, pnew_head, split_hash);
901	}
902
903	/* The resizer sets pold_head to 0 when all cpus see the bucket join.*/
904	join_finishing = (0 != get_next(pold_head));
905	}
906
907	/* move_head() or join_buckets() makes it so or makes the mark visible.*/
908	ASSERT(N_INVALID == get_mark(*pold_head));
909	/* move_head() makes it visible. No JOIN_FOLLOWS used when shrinking. */
910	ASSERT(N_NORMAL == get_mark(*pnew_head));
911
912	*walk_mode = WM_LEAVE_JOIN;
913	} else {
914	/*
915	* Final stage of resize. The resizer is waiting for all
916	* readers to notice that the old table had been replaced.
917	*/
918	ASSERT(b == h->new_b);
919	*walk_mode = WM_NORMAL;
920	}
921	}
922
923
#if 0
/*
 * Currently compiled out. Moves a bucket head by first starting the move
 * and then completing it.
 */
static void move_head(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head)
{
	start_head_move(psrc_head);
	complete_head_move(psrc_head, pdest_head);
}
#endif
931
932	static void help_head_move(marked_ptr_t psrc_head, marked_ptr_t pdest_head)
933	{
934	/* Head move has to in progress already when calling this func. */
935	ASSERT(N_CONST & get_mark(*psrc_head));
936
937	/* Head already moved. */
938	if (N_INVALID == get_mark(*psrc_head)) {
939	/* Effects of the head move have not yet propagated to this cpu. */
940	if (N_INVALID == get_mark(*pdest_head)) {
941	/* Make the move visible on this cpu. */
942	read_barrier();
943	ASSERT(!(N_CONST & get_mark(*pdest_head)));
944	}
945	} else {
946	complete_head_move(psrc_head, pdest_head);
947	}
948	}
949
950	static void start_head_move(marked_ptr_t *psrc_head)
951	{
952	/* Mark src head immutable. */
953	mark_const(psrc_head);
954	}
955
956	static void mark_const(marked_ptr_t *psrc_head)
957	{
958	marked_ptr_t ret, src_link;
959
960	/* Mark src head immutable. */
961	do {
962	cht_link_t next = get_next(psrc_head);
963	src_link = make_link(next, N_NORMAL);
964
965	/* Mark the normal/clean src link immutable/const. */
966	ret = cas_link(psrc_head, next, N_NORMAL, next, N_CONST);
967	} while(ret != src_link && !(N_CONST & get_mark(ret)));
968	}
969
970	static void complete_head_move(marked_ptr_t psrc_head, marked_ptr_t pdest_head)
971	{
972	ASSERT(N_JOIN_FOLLOWS != get_mark(*psrc_head));
973	ASSERT(N_CONST & get_mark(*psrc_head));
974
975	cht_link_t next = get_next(psrc_head);
976	/* todo: cas order barrier */
977	cas_link(pdest_head, 0, N_INVALID, next, N_NORMAL);
978	/* todo: cas order barrier */
979	cas_link(psrc_head, next, N_CONST, next, N_INVALID);
980	}
981
982	static void split_bucket(cht_t h, marked_ptr_t psrc_head,
983	marked_ptr_t *pdest_head, size_t split_hash)
984	{
985	/* Already split. */
986	if (N_NORMAL == get_mark(*pdest_head))
987	return;
988
989	/*
990	* L == Last node of the first part of the split bucket. That part
991	* remains in the original/src bucket.
992	* F == First node of the second part of the split bucket. That part
993	* will be referenced from the dest bucket head.
994	*
995	* We want to first mark a clean L as JF so that updaters unaware of
996	* the split (or table resize):
997	* - do not insert a new node between L and F
998	* - do not unlink L (that is why it has to be clean/normal)
999	* - do not unlink F
1000	*
1001	* Then we can safely mark F as JN even if it has been marked deleted.
1002	* Once F is marked as JN updaters aware of table resize will not
1003	* attempt to unlink it (JN will have two predecessors - we cannot
1004	* safely unlink from both at the same time). Updaters unaware of
1005	* ongoing resize can reach F only via L and that node is already
1006	* marked JF, so they won't unlink F.
1007	*
1008	* Last, link the new/dest head to F.
1009	*
1010	*
1011	* 0) ,-- split_hash, first hash of the dest bucket
1012	* v
1013	* [src_head \| N] -> .. -> [L] -> [F]
1014	* [dest_head \| Inv]
1015	*
1016	* 1) ,-- split_hash
1017	* v
1018	* [src_head \| N] -> .. -> [JF] -> [F]
1019	* [dest_head \| Inv]
1020	*
1021	* 2) ,-- split_hash
1022	* v
1023	* [src_head \| N] -> .. -> [JF] -> [JN]
1024	* [dest_head \| Inv]
1025	*
1026	* 2) ,-- split_hash
1027	* v
1028	* [src_head \| N] -> .. -> [JF] -> [JN]
1029	* ^
1030	* [dest_head \| N] -----------------'
1031	*/
1032	wnd_t wnd;
1033	bool done;
1034
1035	rcu_read_lock();
1036
1037	/* Mark the last node of the first part of the split bucket as JF. */
1038	mark_join_follows(h, psrc_head, split_hash, &wnd);
1039
1040	/* todo: cas order barrier */
1041
1042	/* There are nodes in the dest bucket, ie the second part of the split. */
1043	if (wnd.cur) {
1044	/*
1045	* Mark the first node of the dest bucket as a join node so
1046	* updaters do not attempt to unlink it if it is deleted.
1047	*/
1048	mark_join_node(wnd.cur);
1049	} else {
1050	/*
1051	* Second part of the split bucket is empty. There are no nodes
1052	* to mark as JOIN nodes and there never will be.
1053	*/
1054	}
1055
1056	/* Link the dest head to the second part of the split. */
1057	cas_link(pdest_head, 0, N_INVALID, wnd.cur, N_NORMAL);
1058
1059	rcu_read_unlock();
1060	}
1061
1062	static void mark_join_follows(cht_t h, marked_ptr_t psrc_head,
1063	size_t split_hash, wnd_t *wnd)
1064	{
1065	/* See comment in split_bucket(). */
1066
1067	bool done;
1068	do {
1069	bool dummy;
1070	wnd->ppred = psrc_head;
1071	wnd->cur = get_next(*psrc_head);
1072
1073	/*
1074	* Find the split window, ie the last node of the first part of
1075	* the split bucket and the its successor - the first node of
1076	* the second part of the split bucket. Retry if GC failed.
1077	*/
1078	if (!find_wnd_and_gc(h, split_hash, WM_MOVE_JOIN_FOLLOWS, wnd, &dummy))
1079	continue;
1080
1081	/*
1082	* Mark the last node of the first half of the split bucket
1083	* that a join node follows. It must be clean/normal.
1084	*/
1085	marked_ptr_t ret
1086	= cas_link(wnd->ppred, wnd->cur, N_NORMAL, wnd->cur, N_JOIN_FOLLOWS);
1087
1088	/* Successfully marked as a JF node or already marked that way. */
1089	done = (ret == make_link(wnd->cur, N_NORMAL))
1090	\|\| (N_JOIN_FOLLOWS & get_mark(ret));
1091	} while (!done);
1092	}
1093
1094	static void mark_join_node(cht_link_t *join_node)
1095	{
1096	/* See comment in split_bucket(). */
1097
1098	bool done;
1099	do {
1100	cht_link_t next = get_next(join_node);
1101	mark_t mark = get_mark(*join_node);
1102
1103	/*
1104	* May already be marked as deleted, but it won't be unlinked
1105	* because its predecessor is marked with JOIN_FOLLOWS or CONST.
1106	*/
1107	marked_ptr_t ret
1108	= cas_link(&join_node->link, next, mark, next, mark \| N_JOIN);
1109
1110	/* Successfully marked or already marked as a join node. */
1111	done = (ret == make_link(next, mark))
1112	\|\| (N_JOIN & get_mark(ret));
1113	} while(!done);
1114	}
1115
1116
1117	static void join_buckets(cht_t h, marked_ptr_t psrc_head,
1118	marked_ptr_t *pdest_head, size_t split_hash)
1119	{
1120	/* Buckets already joined. */
1121	if (N_INVALID == get_mark(*psrc_head))
1122	return;
1123	/*
1124	* F == First node of psrc_head, ie the bucket we want to append
1125	* to (ie join with) the bucket starting at pdest_head.
1126	* L == Last node of pdest_head, ie the bucket that psrc_head will
1127	* be appended to.
1128	*
1129	* (1) We first mark psrc_head immutable to signal that a join is
1130	* in progress and so that updaters unaware of the join (or table
1131	* resize):
1132	* - do not insert new nodes between the head psrc_head and F
1133	* - do not unlink F (it may already be marked deleted)
1134	*
1135	* (2) Next, F is marked as a join node. Updaters aware of table resize
1136	* will not attempt to unlink it. We cannot safely/atomically unlink
1137	* the join node because it will be pointed to from two different
1138	* buckets. Updaters unaware of resize will fail to unlink the join
1139	* node due to the head being marked immutable.
1140	*
1141	* (3) Then the tail of the bucket at pdest_head is linked to the join
1142	* node. From now on, nodes in both buckets can be found via pdest_head.
1143	*
1144	* (4) Last, mark immutable psrc_head as invalid. It signals updaters
1145	* that the join is complete and they can insert new nodes (originally
1146	* destined for psrc_head) into pdest_head.
1147	*
1148	* Note that pdest_head keeps pointing at the join node. This allows
1149	* lookups and updaters to determine if they should see a link between
1150	* the tail L and F when searching for nodes originally in psrc_head
1151	* via pdest_head. If they reach the tail of pdest_head without
1152	* encountering any nodes of psrc_head, either there were no nodes
1153	* in psrc_head to begin with or the link between L and F did not
1154	* yet propagate to their cpus. If psrc_head was empty, it remains
1155	* NULL. Otherwise psrc_head points to a join node (it will not be
1156	* unlinked until table resize completes) and updaters/lookups
1157	* should issue a read_barrier() to make the link [L]->[JN] visible.
1158	*
1159	* 0) ,-- split_hash, first hash of the src bucket
1160	* v
1161	* [dest_head \| N]-> .. -> [L]
1162	* [src_head \| N]--> [F] -> ..
1163	* ^
1164	* ` split_hash, first hash of the src bucket
1165	*
1166	* 1) ,-- split_hash
1167	* v
1168	* [dest_head \| N]-> .. -> [L]
1169	* [src_head \| C]--> [F] -> ..
1170	*
1171	* 2) ,-- split_hash
1172	* v
1173	* [dest_head \| N]-> .. -> [L]
1174	* [src_head \| C]--> [JN] -> ..
1175	*
1176	* 3) ,-- split_hash
1177	* v
1178	* [dest_head \| N]-> .. -> [L] --+
1179	* v
1180	* [src_head \| C]-------------> [JN] -> ..
1181	*
1182	* 4) ,-- split_hash
1183	* v
1184	* [dest_head \| N]-> .. -> [L] --+
1185	* v
1186	* [src_head \| Inv]-----------> [JN] -> ..
1187	*/
1188
1189	rcu_read_lock();
1190
1191	/* Mark src_head immutable - signals updaters bucket join started. */
1192	mark_const(psrc_head);
1193	/* todo: cas order barrier*/
1194
1195	cht_link_t join_node = get_next(psrc_head);
1196
1197	if (join_node) {
1198	mark_join_node(join_node);
1199	/* todo: cas order barrier*/
1200
1201	link_to_join_node(h, pdest_head, join_node, split_hash);
1202	/* todo: cas order barrier*/
1203	}
1204
1205	cas_link(psrc_head, join_node, N_CONST, join_node, N_INVALID);
1206
1207	rcu_read_unlock();
1208	}
1209
1210	static void link_to_join_node(cht_t h, marked_ptr_t pdest_head,
1211	cht_link_t *join_node, size_t split_hash)
1212	{
1213	bool done;
1214	do {
1215	wnd_t wnd = {
1216	.ppred = pdest_head,
1217	.cur = get_next(*pdest_head)
1218	};
1219
1220	bool dummy;
1221
1222	if (!find_wnd_and_gc(h, split_hash, WM_LEAVE_JOIN, &wnd, &dummy))
1223	continue;
1224
1225	if (wnd.cur) {
1226	/* Must be from the new appended bucket. */
1227	ASSERT(split_hash <= node_hash(h, wnd.cur));
1228	return;
1229	}
1230
1231	/* Reached the tail of pdest_head - link it to the join node. */
1232	marked_ptr_t ret = cas_link(wnd.ppred, 0, N_NORMAL, join_node, N_NORMAL);
1233
1234	done = (ret == make_link(0, N_NORMAL));
1235	} while (!done);
1236	}
1237
1238	static void free_later(cht_t h, cht_link_t item)
1239	{
1240	/*
1241	* remove_callback only works as rcu_func_t because rcu_link is the first
1242	* field in cht_link_t.
1243	*/
1244	rcu_call(&item->rcu_link, (rcu_func_t)h->op->remove_callback);
1245
1246	item_removed(h);
1247	}
1248
1249	static void item_removed(cht_t *h)
1250	{
1251	/* todo: impl */
1252	}
1253
1254	static void item_inserted(cht_t *h)
1255	{
1256	/* todo: impl */
1257	}
1258
1259	static void resize_table(void *arg)
1260	{
1261	cht_t h = (cht_t )arg;
1262
1263	ASSERT(h->b);
1264	ASSERT(0 < (read_barrier(), atomic_get(&h->resize_reqs)));
1265
1266	/* Load the most current h->item_cnt. */
1267	read_barrier();
1268	do {
1269	size_t cur_items = h->item_cnt;
1270	size_t bucket_cnt = (1 << h->b->order);
1271
1272	if (cur_items >= CHT_MAX_LOAD * bucket_cnt) {
1273	grow_table(h);
1274	} else if (cur_items <= CHT_MAX_LOAD * bucket_cnt / 4) {
1275	shrink_table(h);
1276	}
1277
1278	/* Load the most current h->item_cnt and h->resize_reqs. */
1279	read_barrier();
1280	} while (0 < atomic_predec(&h->resize_reqs));
1281	}
1282
1283	static void grow_table(cht_t *h)
1284	{
1285	if (h->b->order >= CHT_MAX_ORDER)
1286	return;
1287
1288	h->new_b = alloc_buckets(h->b->order + 1, true);
1289
1290	/* Failed to alloc a new table - try next time the resizer is run. */
1291	if (!h->new_b)
1292	return;
1293
1294	/* Wait for all readers and updaters to see the initialized new table. */
1295	rcu_synchronize();
1296
1297	size_t old_bucket_cnt = (1 << h->b->order);
1298
1299	/*
1300	* Give updaters a chance to help out with the resize. Do the minimum
1301	* work needed to announce a resize is in progress, ie start moving heads.
1302	*/
1303	for (size_t idx = 0; idx < old_bucket_cnt; ++idx) {
1304	start_head_move(&h->b->head[idx]);
1305	}
1306
1307	/* Complete moving heads and split any buckets not yet split by updaters. */
1308	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
1309	marked_ptr_t *move_dest_head = &h->new_b->head[grow_idx(old_idx)];
1310	marked_ptr_t *move_src_head = &h->b->head[old_idx];
1311
1312	/* Head move not yet completed. */
1313	if (N_INVALID != get_mark(*move_src_head)) {
1314	complete_head_move(move_src_head, move_dest_head);
1315	}
1316
1317	size_t split_idx = grow_to_split_idx(old_idx);
1318	size_t split_hash = calc_split_hash(split_idx, h->new_b->order);
1319	marked_ptr_t *split_dest_head = &h->new_b->head[split_idx];
1320
1321	split_bucket(h, move_dest_head, split_dest_head, split_hash);
1322	}
1323
1324	/*
1325	* Wait for all updaters to notice the new heads. Once everyone sees
1326	* the invalid old bucket heads they will know a resize is in progress
1327	* and updaters will modify the correct new buckets.
1328	*/
1329	rcu_synchronize();
1330
1331	/* Clear the JOIN_FOLLOWS mark and remove the link between the split buckets.*/
1332	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
1333	size_t new_idx = grow_idx(old_idx);
1334
1335	cleanup_join_follows(h, &h->new_b[new_idx]);
1336	}
1337
1338	/*
1339	* Wait for everyone to notice that buckets were split, ie link connecting
1340	* the join follows and join node has been cut.
1341	*/
1342	rcu_synchronize();
1343
1344	/* Clear the JOIN mark and GC any deleted join nodes. */
1345	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
1346	size_t new_idx = grow_to_split_idx(old_idx);
1347
1348	cleanup_join_node(h, &h->new_b[new_idx]);
1349	}
1350
1351	/* Wait for everyone to see that the table is clear of any resize marks. */
1352	rcu_synchronize();
1353
1354	cht_buckets_t *old_b = h->b;
1355	rcu_assign(h->b, h->new_b);
1356
1357	/* Wait for everyone to start using the new table. */
1358	rcu_synchronize();
1359
1360	free(old_b);
1361
1362	/* Not needed; just for increased readability. */
1363	h->new_b = 0;
1364	}
1365
1366	static void shrink_table(cht_t *h)
1367	{
1368	if (h->b->order <= CHT_MIN_ORDER)
1369	return;
1370
1371	h->new_b = alloc_buckets(h->b->order - 1, true);
1372
1373	/* Failed to alloc a new table - try next time the resizer is run. */
1374	if (!h->new_b)
1375	return;
1376
1377	/* Wait for all readers and updaters to see the initialized new table. */
1378	rcu_synchronize();
1379
1380	size_t old_bucket_cnt = (1 << h->b->order);
1381
1382	/*
1383	* Give updaters a chance to help out with the resize. Do the minimum
1384	* work needed to announce a resize is in progress, ie start moving heads.
1385	*/
1386	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
1387	size_t new_idx = shrink_idx(old_idx);
1388
1389	/* This bucket should be moved. */
1390	if (grow_idx(new_idx) == old_idx) {
1391	start_head_move(&h->b->head[old_idx]);
1392	} else {
1393	/* This bucket should join the moved bucket once the move is done.*/
1394	}
1395	}
1396
1397	/* Complete moving heads and join buckets with the moved buckets. */
1398	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
1399	size_t new_idx = shrink_idx(old_idx);
1400
1401	/* This bucket should be moved. */
1402	if (grow_idx(new_idx) == old_idx) {
1403	/* Head move not yet completed. */
1404	if (N_INVALID != get_mark(h->b->head[old_idx])) {
1405	complete_head_move(&h->b->head[old_idx], &h->new_b->head[new_idx]);
1406	}
1407	} else {
1408	/* This bucket should join the moved bucket. */
1409	size_t split_hash = calc_split_hash(old_idx, h->b->order);
1410	join_buckets(h, &h->b->head[old_idx], &h->new_b->head[new_idx],
1411	split_hash);
1412	}
1413	}
1414
1415	/*
1416	* Wait for all updaters to notice the new heads. Once everyone sees
1417	* the invalid old bucket heads they will know a resize is in progress
1418	* and updaters will modify the correct new buckets.
1419	*/
1420	rcu_synchronize();
1421
1422	/* Let everyone know joins are complete and fully visible. */
1423	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
1424	size_t move_src_idx = grow_idx(shrink_idx(old_idx));
1425
1426	/* Set the invalid joinee head to NULL. */
1427	if (old_idx != move_src_idx) {
1428	ASSERT(N_INVALID == h->b->head[old_idx]);
1429
1430	if (0 != get_next(h->b->head[old_idx]))
1431	h->b->head[old_idx] = make_link(0, N_INVALID);
1432	}
1433	}
1434
1435	/* todo comment join node vs reset joinee head*/
1436	rcu_synchronize();
1437
1438	size_t new_bucket_cnt = (1 << h->new_b->order);
1439
1440	/* Clear the JOIN mark and GC any deleted join nodes. */
1441	for (size_t new_idx = 0; new_idx < new_bucket_cnt; ++new_idx) {
1442	cleanup_join_node(h, &h->new_b[new_idx]);
1443	}
1444
1445	/* Wait for everyone to see that the table is clear of any resize marks. */
1446	rcu_synchronize();
1447
1448	cht_buckets_t *old_b = h->b;
1449	rcu_assign(h->b, h->new_b);
1450
1451	/* Wait for everyone to start using the new table. */
1452	rcu_synchronize();
1453
1454	free(old_b);
1455
1456	/* Not needed; just for increased readability. */
1457	h->new_b = 0;
1458	}
1459
1460	static void cleanup_join_node(cht_t h, marked_ptr_t new_head)
1461	{
1462	rcu_read_lock();
1463
1464	cht_link_t cur = get_next(new_head);
1465
1466	while (cur) {
1467	/* Clear the join node's JN mark - even if it is marked as deleted. */
1468	if (N_JOIN & get_mark(cur->link)) {
1469	clear_join_and_gc(h, cur, new_head);
1470	break;
1471	}
1472
1473	cur = get_next(cur->link);
1474	}
1475
1476	rcu_read_unlock();
1477	}
1478
1479	static void clear_join_and_gc(cht_t h, cht_link_t join_node,
1480	marked_ptr_t *new_head)
1481	{
1482	ASSERT(join_node && (N_JOIN & get_mark(join_node->link)));
1483
1484	bool done;
1485
1486	/* Clear the JN mark. */
1487	do {
1488	marked_ptr_t jn_link = join_node->link;
1489	cht_link_t *next = get_next(jn_link);
1490	/* Clear the JOIN mark but keep the DEL mark if present. */
1491	mark_t cleared_mark = get_mark(jn_link) & N_DELETED;
1492
1493	marked_ptr_t ret =
1494	_cas_link(&join_node->link, jn_link, make_link(next, cleared_mark));
1495
1496	/* Done if the mark was cleared. Retry if a new node was inserted. */
1497	done = (ret == jn_link);
1498	} while (!done);
1499
1500	if (!(N_DELETED & get_mark(join_node->link)))
1501	return;
1502
1503	/* The join node had been marked as deleted - GC it. */
1504
1505	size_t jn_hash = node_hash(h, join_node);
1506	do {
1507	bool resizing;
1508
1509	wnd_t wnd = {
1510	.ppred = new_head,
1511	.cur = get_next(*new_head)
1512	};
1513
1514	done = find_wnd_and_gc_pred(h, jn_hash, WM_NORMAL, same_node_pred,
1515	join_node, &wnd, &resizing);
1516
1517	ASSERT(!resizing);
1518	} while (!done);
1519	}
1520
1521	static void cleanup_join_follows(cht_t h, marked_ptr_t new_head)
1522	{
1523	ASSERT(new_head);
1524
1525	rcu_read_lock();
1526
1527	wnd_t wnd = {
1528	.ppred = 0,
1529	.cur = 0
1530	};
1531	marked_ptr_t *cur_link = new_head;
1532
1533	/*
1534	* Find the non-deleted node with a JF mark and clear the JF mark.
1535	* The JF node may be deleted and/or the mark moved to its neighbors
1536	* at any time. Therefore, we GC deleted nodes until we find the JF
1537	* node in order to remove stale/deleted JF nodes left behind eg by
1538	* delayed threads that did not yet get a chance to unlink the deleted
1539	* JF node and move its mark.
1540	*
1541	* Note that the head may be marked JF (but never DELETED).
1542	*/
1543	while (true) {
1544	bool is_jf_node = N_JOIN_FOLLOWS & get_mark(*cur_link);
1545
1546	/* GC any deleted nodes on the way - even deleted JOIN_FOLLOWS. */
1547	if (N_DELETED & get_mark(*cur_link)) {
1548	ASSERT(cur_link != new_head);
1549	ASSERT(wnd.ppred && wnd.cur);
1550	ASSERT(cur_link == &wnd.cur->link);
1551
1552	bool dummy;
1553	bool deleted = gc_deleted_node(h, WM_MOVE_JOIN_FOLLOWS, &wnd, &dummy);
1554
1555	/* Failed to GC or collected a deleted JOIN_FOLLOWS. */
1556	if (!deleted \|\| is_jf_node) {
1557	/* Retry from the head of the bucket. */
1558	cur_link = new_head;
1559	continue;
1560	}
1561	} else {
1562	/* Found a non-deleted JF. Clear its JF mark. */
1563	if (is_jf_node) {
1564	cht_link_t next = get_next(cur_link);
1565	marked_ptr_t ret
1566	= cas_link(cur_link, next, N_JOIN_FOLLOWS, 0, N_NORMAL);
1567
1568	/* Successfully cleared the JF mark of a non-deleted node. */
1569	if (ret == make_link(next, N_JOIN_FOLLOWS)) {
1570	break;
1571	} else {
1572	/*
1573	* The JF node had been deleted or a new node inserted
1574	* right after it. Retry from the head.
1575	*/
1576	cur_link = new_head;
1577	continue;
1578	}
1579	} else {
1580	wnd.ppred = cur_link;
1581	wnd.cur = get_next(*cur_link);
1582	}
1583	}
1584
1585	/* We must encounter a JF node before we reach the end of the bucket. */
1586	ASSERT(wnd.cur);
1587	cur_link = &wnd.cur->link;
1588	}
1589
1590	rcu_read_unlock();
1591	}
1592
1593
/** Returns split_idx shifted into the topmost `order` bits of a size_t.
 *
 * @param split_idx Bucket index.
 * @param order     Number of index bits; asserted to be 1..8 * sizeof(size_t).
 */
static size_t calc_split_hash(size_t split_idx, size_t order)
{
	ASSERT(1 <= order && order <= 8 * sizeof(size_t));

	const size_t shift = 8 * sizeof(size_t) - order;

	return split_idx << shift;
}
1599
/** Returns the topmost `order` bits of hash as a bucket index.
 *
 * @param hash  Hash to map to a bucket.
 * @param order Number of index bits; asserted to be 1..8 * sizeof(size_t).
 */
static size_t calc_bucket_idx(size_t hash, size_t order)
{
	ASSERT(1 <= order && order <= 8 * sizeof(size_t));

	const size_t shift = 8 * sizeof(size_t) - order;

	return hash >> shift;
}
1605
/** Returns grow_idx(old_idx) with the lowest bit set. */
static size_t grow_to_split_idx(size_t old_idx)
{
	const size_t moved_idx = grow_idx(old_idx);

	return moved_idx | 1;
}
1610
/** Returns idx doubled (idx shifted left by one bit). */
static size_t grow_idx(size_t idx)
{
	return idx * 2;
}
1615
/** Returns idx halved, rounding down (idx shifted right by one bit). */
static size_t shrink_idx(size_t idx)
{
	return idx / 2;
}
1620
1621
1622	static size_t key_hash(cht_t h, void key)
1623	{
1624	return hash_mix(h->op->key_hash(key));
1625	}
1626
1627	static size_t node_hash(cht_t h, const cht_link_t item)
1628	{
1629	return hash_mix(h->op->hash(item));
1630	}
1631
1632
1633	static marked_ptr_t make_link(const cht_link_t *next, mark_t mark)
1634	{
1635	marked_ptr_t ptr = (marked_ptr_t) next;
1636
1637	ASSERT(!(ptr & N_MARK_MASK));
1638	ASSERT(!((unsigned)mark & ~N_MARK_MASK));
1639
1640	return ptr \| mark;
1641	}
1642
1643
1644	static cht_link_t * get_next(marked_ptr_t link)
1645	{
1646	return (cht_link_t*)(link & ~N_MARK_MASK);
1647	}
1648
1649
1650	static mark_t get_mark(marked_ptr_t link)
1651	{
1652	return (mark_t)(link & N_MARK_MASK);
1653	}
1654
1655
1656	static void next_wnd(wnd_t *wnd)
1657	{
1658	ASSERT(wnd);
1659	ASSERT(wnd->cur);
1660
1661	wnd->last = wnd->cur;
1662	wnd->ppred = &wnd->cur->link;
1663	wnd->cur = get_next(wnd->cur->link);
1664	}
1665
1666
1667	static bool same_node_pred(void node, const cht_link_t item2)
1668	{
1669	const cht_link_t item1 = (const cht_link_t) node;
1670	return item1 == item2;
1671	}
1672
1673	static marked_ptr_t cas_link(marked_ptr_t link, const cht_link_t cur_next,
1674	mark_t cur_mark, const cht_link_t *new_next, mark_t new_mark)
1675	{
1676	return _cas_link(link, make_link(cur_next, cur_mark),
1677	make_link(new_next, new_mark));
1678	}
1679
1680	static marked_ptr_t _cas_link(marked_ptr_t *link, marked_ptr_t cur,
1681	marked_ptr_t new)
1682	{
1683	/*
1684	* cas(x) on the same location x on one cpu must be ordered, but do not
1685	* have to be ordered wrt to other cas(y) to a different location y
1686	* on the same cpu.
1687	*
1688	* cas(x) must act as a write barrier on x, ie if cas(x) succeeds
1689	* and is observed by another cpu, then all cpus must be able to
1690	* make the effects of cas(x) visible just by issuing a load barrier.
1691	* For example:
1692	* cpu1 cpu2 cpu3
1693	* cas(x, 0 -> 1), succeeds
1694	* cas(x, 0 -> 1), fails
1695	* MB
1696	* y = 7
1697	* sees y == 7
1698	* loadMB must be enough to make cas(x) on cpu3 visible to cpu1, ie x == 1.
1699	*
1700	* If cas() did not work this way:
1701	* - our head move protocol would not be correct.
1702	* - freeing an item linked to a moved head after another item was
1703	* inserted in front of it, would require more than one grace period.
1704	*/
1705	}
1706
1707	/** @}
1708	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: