1 /* SPDX-License-Identifier: GPL-2.0-only */
2 /* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
3
4 #ifndef _IP_SET_HASH_GEN_H
5 #define _IP_SET_HASH_GEN_H
6
7 #include <linux/rcupdate.h>
8 #include <linux/jhash.h>
9 #include <linux/types.h>
10 #include <linux/netfilter/nfnetlink.h>
11 #include <linux/netfilter/ipset/ip_set.h>
12
13 #define __ipset_dereference(p) \
14 rcu_dereference_protected(p, 1)
15 #define ipset_dereference_nfnl(p) \
16 rcu_dereference_protected(p, \
17 lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
18 #define ipset_dereference_set(p, set) \
19 rcu_dereference_protected(p, \
20 lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
21 lockdep_is_held(&(set)->lock))
22 #define ipset_dereference_bh_nfnl(p) \
23 rcu_dereference_bh_check(p, \
24 lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
25
26 /* Hashing which uses arrays to resolve clashes. The hash table is resized
27 * (doubled) when searching becomes too long.
28 * Internally jhash is used with the assumption that the size of the
29 * stored data is a multiple of sizeof(u32).
30 *
31 * Readers and resizing
32 *
33 * Resizing can be triggered by userspace command only, and those
34 * are serialized by the nfnl mutex. During resizing the set is
35 * read-locked, so the only possible concurrent operations are
36 * the kernel side readers. Those must be protected by proper RCU locking.
37 */
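/* Illustrative sketch (editorial, not part of the original code): a
 * kernel-side reader is expected to look up the table under RCU, roughly
 * as follows, using the helpers defined later in this file:
 *
 *	rcu_read_lock_bh();
 *	t = rcu_dereference_bh(h->table);
 *	... lookup in t ...
 *	rcu_read_unlock_bh();
 *
 * Resizing publishes the new table with rcu_assign_pointer() and frees the
 * old one only after synchronize_rcu() and after its reference counts drop.
 */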
38
39 /* Number of elements to store in an initial array block */
40 #define AHASH_INIT_SIZE 4
41 /* Max number of elements to store in an array block */
42 #define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
43 /* Max number of elements in the array block when tuned */
44 #define AHASH_MAX_TUNED 64
45
46 /* The max number of elements in an array block can be tuned */
47 #ifdef IP_SET_HASH_WITH_MULTI
48 #define AHASH_MAX(h) ((h)->ahash_max)
49
50 static u8
51 tune_ahash_max(u8 curr, u32 multi)
52 {
53 u32 n;
54
55 if (multi < curr)
56 return curr;
57
58 n = curr + AHASH_INIT_SIZE;
59 /* Currently, when listing, one hash bucket must fit into a message.
60 * Therefore we have a hard limit here.
61 */
62 return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
63 }
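/* Example (illustrative): with curr == AHASH_MAX_SIZE (12) and a clashing
 * element reporting multi >= 12, the limit is bumped to 16; once curr
 * reaches AHASH_MAX_TUNED (64) it stays there.
 */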
64
65 #define TUNE_AHASH_MAX(h, multi) \
66 ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
67 #else
68 #define AHASH_MAX(h) AHASH_MAX_SIZE
69 #define TUNE_AHASH_MAX(h, multi)
70 #endif
71
72 /* A hash bucket */
73 struct hbucket {
74 struct rcu_head rcu; /* for call_rcu */
75 /* Which positions are used in the array */
76 DECLARE_BITMAP(used, AHASH_MAX_TUNED);
77 u8 size; /* size of the array */
78 u8 pos; /* position of the first free entry */
79 unsigned char value[] /* the array of the values */
80 __aligned(__alignof__(u64));
81 };
82
83 /* Region size for locking == 2^HTABLE_REGION_BITS */
84 #define HTABLE_REGION_BITS 10
85 #define ahash_numof_locks(htable_bits) \
86 ((htable_bits) < HTABLE_REGION_BITS ? 1 \
87 : jhash_size((htable_bits) - HTABLE_REGION_BITS))
88 #define ahash_sizeof_regions(htable_bits) \
89 (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
90 #define ahash_region(n, htable_bits) \
91 ((n) % ahash_numof_locks(htable_bits))
92 #define ahash_bucket_start(h, htable_bits) \
93 ((htable_bits) < HTABLE_REGION_BITS ? 0 \
94 : (h) * jhash_size(HTABLE_REGION_BITS))
95 #define ahash_bucket_end(h, htable_bits) \
96 ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \
97 : ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
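/* Example (illustrative): with HTABLE_REGION_BITS == 10 and
 * htable_bits == 12 the table has jhash_size(12) == 4096 buckets,
 * ahash_numof_locks(12) == jhash_size(2) == 4 region locks, and region r
 * spans the buckets [r * 1024, (r + 1) * 1024).
 */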
98
99 struct htable_gc {
100 struct delayed_work dwork;
101 struct ip_set *set; /* Set the gc belongs to */
102 u32 region; /* Last gc run position */
103 };
104
105 /* The hash table: the table size is stored here to make resizing easy */
106 struct htable {
107 atomic_t ref; /* References for resizing */
108 atomic_t uref; /* References for dumping and gc */
109 u8 htable_bits; /* size of hash table == 2^htable_bits */
110 u32 maxelem; /* Maxelem per region */
111 struct ip_set_region *hregion; /* Region locks and ext sizes */
112 struct hbucket __rcu *bucket[]; /* hashtable buckets */
113 };
114
115 #define hbucket(h, i) ((h)->bucket[i])
116 #define ext_size(n, dsize) \
117 (sizeof(struct hbucket) + (n) * (dsize))
118
119 #ifndef IPSET_NET_COUNT
120 #define IPSET_NET_COUNT 1
121 #endif
122
123 /* Book-keeping of the prefixes added to the set */
124 struct net_prefixes {
125 u32 nets[IPSET_NET_COUNT]; /* number of elements for this cidr */
126 u8 cidr[IPSET_NET_COUNT]; /* the cidr value */
127 };
128
129 /* Compute the hash table size */
130 static size_t
131 htable_size(u8 hbits)
132 {
133 size_t hsize;
134
135 /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
136 if (hbits > 31)
137 return 0;
138 hsize = jhash_size(hbits);
139 if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
140 < hsize)
141 return 0;
142
143 return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
144 }
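/* Example (illustrative): htable_size(10) returns the allocation size of a
 * 1024-bucket table, i.e. sizeof(struct htable) plus 1024 pointers to
 * struct hbucket; it returns 0 when 2^hbits would not fit the
 * jhash/kvmalloc limits checked above.
 */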
145
146 #ifdef IP_SET_HASH_WITH_NETS
147 #if IPSET_NET_COUNT > 1
148 #define __CIDR(cidr, i) (cidr[i])
149 #else
150 #define __CIDR(cidr, i) (cidr)
151 #endif
152
153 /* cidr + 1 is stored in net_prefixes to support /0 */
154 #define NCIDR_PUT(cidr) ((cidr) + 1)
155 #define NCIDR_GET(cidr) ((cidr) - 1)
156
157 #ifdef IP_SET_HASH_WITH_NETS_PACKED
158 /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */
159 #define DCIDR_PUT(cidr) ((cidr) - 1)
160 #define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1)
161 #else
162 #define DCIDR_PUT(cidr) (cidr)
163 #define DCIDR_GET(cidr, i) __CIDR(cidr, i)
164 #endif
165
166 #define INIT_CIDR(cidr, host_mask) \
167 DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
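/* Example (illustrative, IPv4 without IP_SET_HASH_WITH_NETS_PACKED):
 * a /24 element keeps cidr == 24 in the stored data (DCIDR_PUT/GET are
 * no-ops), while the net_prefixes book-keeping stores NCIDR_PUT(24) == 25,
 * so that a /0 network is still representable as the non-zero value 1.
 */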
168
169 #ifdef IP_SET_HASH_WITH_NET0
170 /* cidr from 0 to HOST_MASK value and c = cidr + 1 */
171 #define NLEN (HOST_MASK + 1)
172 #define CIDR_POS(c) ((c) - 1)
173 #else
174 /* cidr from 1 to HOST_MASK value and c = cidr + 1 */
175 #define NLEN HOST_MASK
176 #define CIDR_POS(c) ((c) - 2)
177 #endif
178
179 #else
180 #define NLEN 0
181 #endif /* IP_SET_HASH_WITH_NETS */
182
183 #define SET_ELEM_EXPIRED(set, d) \
184 (SET_WITH_TIMEOUT(set) && \
185 ip_set_timeout_expired(ext_timeout(d, set)))
186
187 #endif /* _IP_SET_HASH_GEN_H */
188
189 #ifndef MTYPE
190 #error "MTYPE is not defined!"
191 #endif
192
193 #ifndef HTYPE
194 #error "HTYPE is not defined!"
195 #endif
196
197 #ifndef HOST_MASK
198 #error "HOST_MASK is not defined!"
199 #endif
200
201 /* Family dependent templates */
202
203 #undef ahash_data
204 #undef mtype_data_equal
205 #undef mtype_do_data_match
206 #undef mtype_data_set_flags
207 #undef mtype_data_reset_elem
208 #undef mtype_data_reset_flags
209 #undef mtype_data_netmask
210 #undef mtype_data_list
211 #undef mtype_data_next
212 #undef mtype_elem
213
214 #undef mtype_ahash_destroy
215 #undef mtype_ext_cleanup
216 #undef mtype_add_cidr
217 #undef mtype_del_cidr
218 #undef mtype_ahash_memsize
219 #undef mtype_flush
220 #undef mtype_destroy
221 #undef mtype_same_set
222 #undef mtype_kadt
223 #undef mtype_uadt
224
225 #undef mtype_add
226 #undef mtype_del
227 #undef mtype_test_cidrs
228 #undef mtype_test
229 #undef mtype_uref
230 #undef mtype_resize
231 #undef mtype_ext_size
232 #undef mtype_resize_ad
233 #undef mtype_head
234 #undef mtype_list
235 #undef mtype_gc_do
236 #undef mtype_gc
237 #undef mtype_gc_init
238 #undef mtype_variant
239 #undef mtype_data_match
240
241 #undef htype
242 #undef HKEY
243
244 #define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
245 #ifdef IP_SET_HASH_WITH_NETS
246 #define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match)
247 #else
248 #define mtype_do_data_match(d) 1
249 #endif
250 #define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags)
251 #define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem)
252 #define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags)
253 #define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask)
254 #define mtype_data_list IPSET_TOKEN(MTYPE, _data_list)
255 #define mtype_data_next IPSET_TOKEN(MTYPE, _data_next)
256 #define mtype_elem IPSET_TOKEN(MTYPE, _elem)
257
258 #define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy)
259 #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
260 #define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr)
261 #define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr)
262 #define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize)
263 #define mtype_flush IPSET_TOKEN(MTYPE, _flush)
264 #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
265 #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
266 #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
267 #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
268
269 #define mtype_add IPSET_TOKEN(MTYPE, _add)
270 #define mtype_del IPSET_TOKEN(MTYPE, _del)
271 #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
272 #define mtype_test IPSET_TOKEN(MTYPE, _test)
273 #define mtype_uref IPSET_TOKEN(MTYPE, _uref)
274 #define mtype_resize IPSET_TOKEN(MTYPE, _resize)
275 #define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size)
276 #define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad)
277 #define mtype_head IPSET_TOKEN(MTYPE, _head)
278 #define mtype_list IPSET_TOKEN(MTYPE, _list)
279 #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
280 #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
281 #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
282 #define mtype_variant IPSET_TOKEN(MTYPE, _variant)
283 #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
284
285 #ifndef HKEY_DATALEN
286 #define HKEY_DATALEN sizeof(struct mtype_elem)
287 #endif
288
289 #define htype MTYPE
290
291 #define HKEY(data, initval, htable_bits) \
292 ({ \
293 const u32 *__k = (const u32 *)data; \
294 u32 __l = HKEY_DATALEN / sizeof(u32); \
295 \
296 BUILD_BUG_ON(HKEY_DATALEN % sizeof(u32) != 0); \
297 \
298 jhash2(__k, __l, initval) & jhash_mask(htable_bits); \
299 })
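/* Example (illustrative): for a plain IPv4 hash:ip type the element is a
 * single __be32, so HKEY_DATALEN == 4, __l == 1 and jhash2() hashes one
 * u32 word; the result is masked with jhash_mask(htable_bits) to pick the
 * bucket index.
 */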
300
301 /* The generic hash structure */
302 struct htype {
303 struct htable __rcu *table; /* the hash table */
304 struct htable_gc gc; /* gc workqueue */
305 u32 maxelem; /* max elements in the hash */
306 u32 initval; /* random jhash init value */
307 #ifdef IP_SET_HASH_WITH_MARKMASK
308 u32 markmask; /* markmask value for mark mask to store */
309 #endif
310 #ifdef IP_SET_HASH_WITH_MULTI
311 u8 ahash_max; /* max elements in an array block */
312 #endif
313 #ifdef IP_SET_HASH_WITH_NETMASK
314 u8 netmask; /* netmask value for subnets to store */
315 #endif
316 struct list_head ad; /* Resize add|del backlist */
317 struct mtype_elem next; /* temporary storage for uadd */
318 #ifdef IP_SET_HASH_WITH_NETS
319 struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
320 #endif
321 };
322
323 /* ADD|DEL entries saved during resize */
324 struct mtype_resize_ad {
325 struct list_head list;
326 enum ipset_adt ad; /* ADD|DEL element */
327 struct mtype_elem d; /* Element value */
328 struct ip_set_ext ext; /* Extensions for ADD */
329 struct ip_set_ext mext; /* Target extensions for ADD */
330 u32 flags; /* Flags for ADD */
331 };
332
333 #ifdef IP_SET_HASH_WITH_NETS
334 /* Network cidr size book-keeping when the hash stores different-sized
335 * networks. cidr == real cidr + 1 to support /0.
336 */
337 static void
338 mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
339 {
340 int i, j;
341
342 spin_lock_bh(&set->lock);
343 /* Add in increasing prefix order, so larger cidr first */
344 for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
345 if (j != -1) {
346 continue;
347 } else if (h->nets[i].cidr[n] < cidr) {
348 j = i;
349 } else if (h->nets[i].cidr[n] == cidr) {
350 h->nets[CIDR_POS(cidr)].nets[n]++;
351 goto unlock;
352 }
353 }
354 if (j != -1) {
355 for (; i > j; i--)
356 h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
357 }
358 h->nets[i].cidr[n] = cidr;
359 h->nets[CIDR_POS(cidr)].nets[n] = 1;
360 unlock:
361 spin_unlock_bh(&set->lock);
362 }
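/* Example (illustrative, IPv4, n == 0): after adding two /24 networks and
 * one /16 network, nets[0].cidr[0] == 25 and nets[1].cidr[0] == 17
 * (cidr + 1, more specific prefix first), while the per-cidr element counts
 * live at nets[CIDR_POS(25)].nets[0] == 2 and nets[CIDR_POS(17)].nets[0] == 1.
 */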
363
364 static void
365 mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
366 {
367 u8 i, j, net_end = NLEN - 1;
368
369 spin_lock_bh(&set->lock);
370 for (i = 0; i < NLEN; i++) {
371 if (h->nets[i].cidr[n] != cidr)
372 continue;
373 h->nets[CIDR_POS(cidr)].nets[n]--;
374 if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
375 goto unlock;
376 for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
377 h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
378 h->nets[j].cidr[n] = 0;
379 goto unlock;
380 }
381 unlock:
382 spin_unlock_bh(&set->lock);
383 }
384 #endif
385
386 /* Calculate the actual memory size of the set data */
387 static size_t
388 mtype_ahash_memsize(const struct htype *h, const struct htable *t)
389 {
390 return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
391 }
392
393 /* Get the ith element from the array block n */
394 #define ahash_data(n, i, dsize) \
395 ((struct mtype_elem *)((n)->value + ((i) * (dsize))))
396
397 static void
398 mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
399 {
400 int i;
401
402 for (i = 0; i < n->pos; i++)
403 if (test_bit(i, n->used))
404 ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
405 }
406
407 /* Flush a hash type of set: destroy all elements */
408 static void
409 mtype_flush(struct ip_set *set)
410 {
411 struct htype *h = set->data;
412 struct htable *t;
413 struct hbucket *n;
414 u32 r, i;
415
416 t = ipset_dereference_nfnl(h->table);
417 for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
418 spin_lock_bh(&t->hregion[r].lock);
419 for (i = ahash_bucket_start(r, t->htable_bits);
420 i < ahash_bucket_end(r, t->htable_bits); i++) {
421 n = __ipset_dereference(hbucket(t, i));
422 if (!n)
423 continue;
424 if (set->extensions & IPSET_EXT_DESTROY)
425 mtype_ext_cleanup(set, n);
426 /* FIXME: use slab cache */
427 rcu_assign_pointer(hbucket(t, i), NULL);
428 kfree_rcu(n, rcu);
429 }
430 t->hregion[r].ext_size = 0;
431 t->hregion[r].elements = 0;
432 spin_unlock_bh(&t->hregion[r].lock);
433 }
434 #ifdef IP_SET_HASH_WITH_NETS
435 memset(h->nets, 0, sizeof(h->nets));
436 #endif
437 }
438
439 /* Destroy the hashtable part of the set */
440 static void
441 mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
442 {
443 struct hbucket *n;
444 u32 i;
445
446 for (i = 0; i < jhash_size(t->htable_bits); i++) {
447 n = __ipset_dereference(hbucket(t, i));
448 if (!n)
449 continue;
450 if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
451 mtype_ext_cleanup(set, n);
452 /* FIXME: use slab cache */
453 kfree(n);
454 }
455
456 ip_set_free(t->hregion);
457 ip_set_free(t);
458 }
459
460 /* Destroy a hash type of set */
461 static void
462 mtype_destroy(struct ip_set *set)
463 {
464 struct htype *h = set->data;
465 struct list_head *l, *lt;
466
467 if (SET_WITH_TIMEOUT(set))
468 cancel_delayed_work_sync(&h->gc.dwork);
469
470 mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
471 list_for_each_safe(l, lt, &h->ad) {
472 list_del(l);
473 kfree(l);
474 }
475 kfree(h);
476
477 set->data = NULL;
478 }
479
480 static bool
481 mtype_same_set(const struct ip_set *a, const struct ip_set *b)
482 {
483 const struct htype *x = a->data;
484 const struct htype *y = b->data;
485
486 /* Resizing changes htable_bits, so we ignore it */
487 return x->maxelem == y->maxelem &&
488 a->timeout == b->timeout &&
489 #ifdef IP_SET_HASH_WITH_NETMASK
490 x->netmask == y->netmask &&
491 #endif
492 #ifdef IP_SET_HASH_WITH_MARKMASK
493 x->markmask == y->markmask &&
494 #endif
495 a->extensions == b->extensions;
496 }
497
498 static void
499 mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
500 {
501 struct hbucket *n, *tmp;
502 struct mtype_elem *data;
503 u32 i, j, d;
504 size_t dsize = set->dsize;
505 #ifdef IP_SET_HASH_WITH_NETS
506 u8 k;
507 #endif
508 u8 htable_bits = t->htable_bits;
509
510 spin_lock_bh(&t->hregion[r].lock);
511 for (i = ahash_bucket_start(r, htable_bits);
512 i < ahash_bucket_end(r, htable_bits); i++) {
513 n = __ipset_dereference(hbucket(t, i));
514 if (!n)
515 continue;
516 for (j = 0, d = 0; j < n->pos; j++) {
517 if (!test_bit(j, n->used)) {
518 d++;
519 continue;
520 }
521 data = ahash_data(n, j, dsize);
522 if (!ip_set_timeout_expired(ext_timeout(data, set)))
523 continue;
524 pr_debug("expired %u/%u\n", i, j);
525 clear_bit(j, n->used);
526 smp_mb__after_atomic();
527 #ifdef IP_SET_HASH_WITH_NETS
528 for (k = 0; k < IPSET_NET_COUNT; k++)
529 mtype_del_cidr(set, h,
530 NCIDR_PUT(DCIDR_GET(data->cidr, k)),
531 k);
532 #endif
533 t->hregion[r].elements--;
534 ip_set_ext_destroy(set, data);
535 d++;
536 }
537 if (d >= AHASH_INIT_SIZE) {
538 if (d >= n->size) {
539 t->hregion[r].ext_size -=
540 ext_size(n->size, dsize);
541 rcu_assign_pointer(hbucket(t, i), NULL);
542 kfree_rcu(n, rcu);
543 continue;
544 }
545 tmp = kzalloc(sizeof(*tmp) +
546 (n->size - AHASH_INIT_SIZE) * dsize,
547 GFP_ATOMIC);
548 if (!tmp)
549 /* Still try to delete expired elements. */
550 continue;
551 tmp->size = n->size - AHASH_INIT_SIZE;
552 for (j = 0, d = 0; j < n->pos; j++) {
553 if (!test_bit(j, n->used))
554 continue;
555 data = ahash_data(n, j, dsize);
556 memcpy(tmp->value + d * dsize,
557 data, dsize);
558 set_bit(d, tmp->used);
559 d++;
560 }
561 tmp->pos = d;
562 t->hregion[r].ext_size -=
563 ext_size(AHASH_INIT_SIZE, dsize);
564 rcu_assign_pointer(hbucket(t, i), tmp);
565 kfree_rcu(n, rcu);
566 }
567 }
568 spin_unlock_bh(&t->hregion[r].lock);
569 }
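/* Example (illustrative): if garbage collection frees 5 slots in a bucket
 * of size 12, d >= AHASH_INIT_SIZE triggers compaction and the surviving
 * entries are copied into a new bucket of size 12 - 4 == 8; had all slots
 * been free, the whole bucket would have been released via kfree_rcu().
 */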
570
571 static void
572 mtype_gc(struct work_struct *work)
573 {
574 struct htable_gc *gc;
575 struct ip_set *set;
576 struct htype *h;
577 struct htable *t;
578 u32 r, numof_locks;
579 unsigned int next_run;
580
581 gc = container_of(work, struct htable_gc, dwork.work);
582 set = gc->set;
583 h = set->data;
584
585 spin_lock_bh(&set->lock);
586 t = ipset_dereference_set(h->table, set);
587 atomic_inc(&t->uref);
588 numof_locks = ahash_numof_locks(t->htable_bits);
589 r = gc->region++;
590 if (r >= numof_locks) {
591 r = gc->region = 0;
592 }
593 next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks;
594 if (next_run < HZ/10)
595 next_run = HZ/10;
596 spin_unlock_bh(&set->lock);
597
598 mtype_gc_do(set, h, t, r);
599
600 if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
601 pr_debug("Table destroy after resize by expire: %p\n", t);
602 mtype_ahash_destroy(set, t, false);
603 }
604
605 queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
606
607 }
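/* Note (editorial): each gc invocation above processes a single region, so
 * the delay between runs is the gc period divided by the number of region
 * locks, floored at HZ/10; a complete sweep of the table therefore still
 * takes roughly one gc period.
 */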
608
609 static void
610 mtype_gc_init(struct htable_gc *gc)
611 {
612 INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc);
613 queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
614 }
615
616 static int
617 mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
618 struct ip_set_ext *mext, u32 flags);
619 static int
620 mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
621 struct ip_set_ext *mext, u32 flags);
622
623 /* Resize a hash: create a new hash table by doubling the hashsize
624 * and inserting the elements into it. Repeat until we succeed or
625 * fail due to memory pressure.
626 */
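/* Example (illustrative): a table created with hashsize 1024 (htable_bits
 * == 10) is retried at 2048, 4096, ... buckets until either all elements
 * are rehashed into the new table or the allocation fails.
 */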
627 static int
628 mtype_resize(struct ip_set *set, bool retried)
629 {
630 struct htype *h = set->data;
631 struct htable *t, *orig;
632 u8 htable_bits;
633 size_t hsize, dsize = set->dsize;
634 #ifdef IP_SET_HASH_WITH_NETS
635 u8 flags;
636 struct mtype_elem *tmp;
637 #endif
638 struct mtype_elem *data;
639 struct mtype_elem *d;
640 struct hbucket *n, *m;
641 struct list_head *l, *lt;
642 struct mtype_resize_ad *x;
643 u32 i, j, r, nr, key;
644 int ret;
645
646 #ifdef IP_SET_HASH_WITH_NETS
647 tmp = kmalloc(dsize, GFP_KERNEL);
648 if (!tmp)
649 return -ENOMEM;
650 #endif
651 orig = ipset_dereference_bh_nfnl(h->table);
652 htable_bits = orig->htable_bits;
653
654 retry:
655 ret = 0;
656 htable_bits++;
657 if (!htable_bits)
658 goto hbwarn;
659 hsize = htable_size(htable_bits);
660 if (!hsize)
661 goto hbwarn;
662 t = ip_set_alloc(hsize);
663 if (!t) {
664 ret = -ENOMEM;
665 goto out;
666 }
667 t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
668 if (!t->hregion) {
669 ip_set_free(t);
670 ret = -ENOMEM;
671 goto out;
672 }
673 t->htable_bits = htable_bits;
674 t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
675 for (i = 0; i < ahash_numof_locks(htable_bits); i++)
676 spin_lock_init(&t->hregion[i].lock);
677
678 /* There can't be another parallel resizing,
679 * but dumping, gc and kernel-side add/del are possible
680 */
681 orig = ipset_dereference_bh_nfnl(h->table);
682 atomic_set(&orig->ref, 1);
683 atomic_inc(&orig->uref);
684 pr_debug("attempt to resize set %s from %u to %u, t %p\n",
685 set->name, orig->htable_bits, htable_bits, orig);
686 for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
687 /* Expire may replace a hbucket with another one */
688 rcu_read_lock_bh();
689 for (i = ahash_bucket_start(r, orig->htable_bits);
690 i < ahash_bucket_end(r, orig->htable_bits); i++) {
691 n = __ipset_dereference(hbucket(orig, i));
692 if (!n)
693 continue;
694 for (j = 0; j < n->pos; j++) {
695 if (!test_bit(j, n->used))
696 continue;
697 data = ahash_data(n, j, dsize);
698 if (SET_ELEM_EXPIRED(set, data))
699 continue;
700 #ifdef IP_SET_HASH_WITH_NETS
701 /* We have readers running in parallel with us,
702 * so the live data cannot be modified.
703 */
704 flags = 0;
705 memcpy(tmp, data, dsize);
706 data = tmp;
707 mtype_data_reset_flags(data, &flags);
708 #endif
709 key = HKEY(data, h->initval, htable_bits);
710 m = __ipset_dereference(hbucket(t, key));
711 nr = ahash_region(key, htable_bits);
712 if (!m) {
713 m = kzalloc(sizeof(*m) +
714 AHASH_INIT_SIZE * dsize,
715 GFP_ATOMIC);
716 if (!m) {
717 ret = -ENOMEM;
718 goto cleanup;
719 }
720 m->size = AHASH_INIT_SIZE;
721 t->hregion[nr].ext_size +=
722 ext_size(AHASH_INIT_SIZE,
723 dsize);
724 RCU_INIT_POINTER(hbucket(t, key), m);
725 } else if (m->pos >= m->size) {
726 struct hbucket *ht;
727
728 if (m->size >= AHASH_MAX(h)) {
729 ret = -EAGAIN;
730 } else {
731 ht = kzalloc(sizeof(*ht) +
732 (m->size + AHASH_INIT_SIZE)
733 * dsize,
734 GFP_ATOMIC);
735 if (!ht)
736 ret = -ENOMEM;
737 }
738 if (ret < 0)
739 goto cleanup;
740 memcpy(ht, m, sizeof(struct hbucket) +
741 m->size * dsize);
742 ht->size = m->size + AHASH_INIT_SIZE;
743 t->hregion[nr].ext_size +=
744 ext_size(AHASH_INIT_SIZE,
745 dsize);
746 kfree(m);
747 m = ht;
748 RCU_INIT_POINTER(hbucket(t, key), ht);
749 }
750 d = ahash_data(m, m->pos, dsize);
751 memcpy(d, data, dsize);
752 set_bit(m->pos++, m->used);
753 t->hregion[nr].elements++;
754 #ifdef IP_SET_HASH_WITH_NETS
755 mtype_data_reset_flags(d, &flags);
756 #endif
757 }
758 }
759 rcu_read_unlock_bh();
760 }
761
762 /* There can't be any other writer. */
763 rcu_assign_pointer(h->table, t);
764
765 /* Give time to other readers of the set */
766 synchronize_rcu();
767
768 pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
769 orig->htable_bits, orig, t->htable_bits, t);
770 /* Add/delete elements processed by the SET target during resize.
771 * Kernel-side add cannot trigger a resize and userspace actions
772 * are serialized by the mutex.
773 */
774 list_for_each_safe(l, lt, &h->ad) {
775 x = list_entry(l, struct mtype_resize_ad, list);
776 if (x->ad == IPSET_ADD) {
777 mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
778 } else {
779 mtype_del(set, &x->d, NULL, NULL, 0);
780 }
781 list_del(l);
782 kfree(l);
783 }
784 /* If there's nobody else using the table, destroy it */
785 if (atomic_dec_and_test(&orig->uref)) {
786 pr_debug("Table destroy by resize %p\n", orig);
787 mtype_ahash_destroy(set, orig, false);
788 }
789
790 out:
791 #ifdef IP_SET_HASH_WITH_NETS
792 kfree(tmp);
793 #endif
794 return ret;
795
796 cleanup:
797 rcu_read_unlock_bh();
798 atomic_set(&orig->ref, 0);
799 atomic_dec(&orig->uref);
800 mtype_ahash_destroy(set, t, false);
801 if (ret == -EAGAIN)
802 goto retry;
803 goto out;
804
805 hbwarn:
806 /* In case we have plenty of memory :-) */
807 pr_warn("Cannot increase the hashsize of set %s further\n", set->name);
808 ret = -IPSET_ERR_HASH_FULL;
809 goto out;
810 }
811
812 /* Get the current number of elements and ext_size in the set */
813 static void
814 mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
815 {
816 struct htype *h = set->data;
817 const struct htable *t;
818 u32 i, j, r;
819 struct hbucket *n;
820 struct mtype_elem *data;
821
822 t = rcu_dereference_bh(h->table);
823 for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
824 for (i = ahash_bucket_start(r, t->htable_bits);
825 i < ahash_bucket_end(r, t->htable_bits); i++) {
826 n = rcu_dereference_bh(hbucket(t, i));
827 if (!n)
828 continue;
829 for (j = 0; j < n->pos; j++) {
830 if (!test_bit(j, n->used))
831 continue;
832 data = ahash_data(n, j, set->dsize);
833 if (!SET_ELEM_EXPIRED(set, data))
834 (*elements)++;
835 }
836 }
837 *ext_size += t->hregion[r].ext_size;
838 }
839 }
840
841 /* Add an element to a hash and update the internal counters when it succeeds,
842 * otherwise report the proper error code.
843 */
844 static int
845 mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
846 struct ip_set_ext *mext, u32 flags)
847 {
848 struct htype *h = set->data;
849 struct htable *t;
850 const struct mtype_elem *d = value;
851 struct mtype_elem *data;
852 struct hbucket *n, *old = ERR_PTR(-ENOENT);
853 int i, j = -1, ret;
854 bool flag_exist = flags & IPSET_FLAG_EXIST;
855 bool deleted = false, forceadd = false, reuse = false;
856 u32 r, key, multi = 0, elements, maxelem;
857
858 rcu_read_lock_bh();
859 t = rcu_dereference_bh(h->table);
860 key = HKEY(value, h->initval, t->htable_bits);
861 r = ahash_region(key, t->htable_bits);
862 atomic_inc(&t->uref);
863 elements = t->hregion[r].elements;
864 maxelem = t->maxelem;
865 if (elements >= maxelem) {
866 u32 e;
867 if (SET_WITH_TIMEOUT(set)) {
868 rcu_read_unlock_bh();
869 mtype_gc_do(set, h, t, r);
870 rcu_read_lock_bh();
871 }
872 maxelem = h->maxelem;
873 elements = 0;
874 for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
875 elements += t->hregion[e].elements;
876 if (elements >= maxelem && SET_WITH_FORCEADD(set))
877 forceadd = true;
878 }
879 rcu_read_unlock_bh();
880
881 spin_lock_bh(&t->hregion[r].lock);
882 n = rcu_dereference_bh(hbucket(t, key));
883 if (!n) {
884 if (forceadd || elements >= maxelem)
885 goto set_full;
886 old = NULL;
887 n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
888 GFP_ATOMIC);
889 if (!n) {
890 ret = -ENOMEM;
891 goto unlock;
892 }
893 n->size = AHASH_INIT_SIZE;
894 t->hregion[r].ext_size +=
895 ext_size(AHASH_INIT_SIZE, set->dsize);
896 goto copy_elem;
897 }
898 for (i = 0; i < n->pos; i++) {
899 if (!test_bit(i, n->used)) {
900 /* Reuse first deleted entry */
901 if (j == -1) {
902 deleted = reuse = true;
903 j = i;
904 }
905 continue;
906 }
907 data = ahash_data(n, i, set->dsize);
908 if (mtype_data_equal(data, d, &multi)) {
909 if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
910 /* Just the extensions could be overwritten */
911 j = i;
912 goto overwrite_extensions;
913 }
914 ret = -IPSET_ERR_EXIST;
915 goto unlock;
916 }
917 /* Reuse first timed out entry */
918 if (SET_ELEM_EXPIRED(set, data) && j == -1) {
919 j = i;
920 reuse = true;
921 }
922 }
923 if (reuse || forceadd) {
924 if (j == -1)
925 j = 0;
926 data = ahash_data(n, j, set->dsize);
927 if (!deleted) {
928 #ifdef IP_SET_HASH_WITH_NETS
929 for (i = 0; i < IPSET_NET_COUNT; i++)
930 mtype_del_cidr(set, h,
931 NCIDR_PUT(DCIDR_GET(data->cidr, i)),
932 i);
933 #endif
934 ip_set_ext_destroy(set, data);
935 t->hregion[r].elements--;
936 }
937 goto copy_data;
938 }
939 if (elements >= maxelem)
940 goto set_full;
941 /* Create a new slot */
942 if (n->pos >= n->size) {
943 TUNE_AHASH_MAX(h, multi);
944 if (n->size >= AHASH_MAX(h)) {
945 /* Trigger rehashing */
946 mtype_data_next(&h->next, d);
947 ret = -EAGAIN;
948 goto resize;
949 }
950 old = n;
951 n = kzalloc(sizeof(*n) +
952 (old->size + AHASH_INIT_SIZE) * set->dsize,
953 GFP_ATOMIC);
954 if (!n) {
955 ret = -ENOMEM;
956 goto unlock;
957 }
958 memcpy(n, old, sizeof(struct hbucket) +
959 old->size * set->dsize);
960 n->size = old->size + AHASH_INIT_SIZE;
961 t->hregion[r].ext_size +=
962 ext_size(AHASH_INIT_SIZE, set->dsize);
963 }
964
965 copy_elem:
966 j = n->pos++;
967 data = ahash_data(n, j, set->dsize);
968 copy_data:
969 t->hregion[r].elements++;
970 #ifdef IP_SET_HASH_WITH_NETS
971 for (i = 0; i < IPSET_NET_COUNT; i++)
972 mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
973 #endif
974 memcpy(data, d, sizeof(struct mtype_elem));
975 overwrite_extensions:
976 #ifdef IP_SET_HASH_WITH_NETS
977 mtype_data_set_flags(data, flags);
978 #endif
979 if (SET_WITH_COUNTER(set))
980 ip_set_init_counter(ext_counter(data, set), ext);
981 if (SET_WITH_COMMENT(set))
982 ip_set_init_comment(set, ext_comment(data, set), ext);
983 if (SET_WITH_SKBINFO(set))
984 ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
985 /* Must come last for the case when a timed-out entry is reused */
986 if (SET_WITH_TIMEOUT(set))
987 ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
988 smp_mb__before_atomic();
989 set_bit(j, n->used);
990 if (old != ERR_PTR(-ENOENT)) {
991 rcu_assign_pointer(hbucket(t, key), n);
992 if (old)
993 kfree_rcu(old, rcu);
994 }
995 ret = 0;
996 resize:
997 spin_unlock_bh(&t->hregion[r].lock);
998 if (atomic_read(&t->ref) && ext->target) {
999 /* Resize is in progress and this is a kernel-side add, save the values */
1000 struct mtype_resize_ad *x;
1001
1002 x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
1003 if (!x)
1004 /* Don't bother */
1005 goto out;
1006 x->ad = IPSET_ADD;
1007 memcpy(&x->d, value, sizeof(struct mtype_elem));
1008 memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
1009 memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
1010 x->flags = flags;
1011 spin_lock_bh(&set->lock);
1012 list_add_tail(&x->list, &h->ad);
1013 spin_unlock_bh(&set->lock);
1014 }
1015 goto out;
1016
1017 set_full:
1018 if (net_ratelimit())
1019 pr_warn("Set %s is full, maxelem %u reached\n",
1020 set->name, maxelem);
1021 ret = -IPSET_ERR_HASH_FULL;
1022 unlock:
1023 spin_unlock_bh(&t->hregion[r].lock);
1024 out:
1025 if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
1026 pr_debug("Table destroy after resize by add: %p\n", t);
1027 mtype_ahash_destroy(set, t, false);
1028 }
1029 return ret;
1030 }
1031
1032 /* Delete an element from the hash and free up space if possible.
1033 */
1034 static int
1035 mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
1036 struct ip_set_ext *mext, u32 flags)
1037 {
1038 struct htype *h = set->data;
1039 struct htable *t;
1040 const struct mtype_elem *d = value;
1041 struct mtype_elem *data;
1042 struct hbucket *n;
1043 struct mtype_resize_ad *x = NULL;
1044 int i, j, k, r, ret = -IPSET_ERR_EXIST;
1045 u32 key, multi = 0;
1046 size_t dsize = set->dsize;
1047
1048 /* Userspace add and resize are excluded by the mutex.
1049 * Kernel-space add does not trigger a resize.
1050 */
1051 rcu_read_lock_bh();
1052 t = rcu_dereference_bh(h->table);
1053 key = HKEY(value, h->initval, t->htable_bits);
1054 r = ahash_region(key, t->htable_bits);
1055 atomic_inc(&t->uref);
1056 rcu_read_unlock_bh();
1057
1058 spin_lock_bh(&t->hregion[r].lock);
1059 n = rcu_dereference_bh(hbucket(t, key));
1060 if (!n)
1061 goto out;
1062 for (i = 0, k = 0; i < n->pos; i++) {
1063 if (!test_bit(i, n->used)) {
1064 k++;
1065 continue;
1066 }
1067 data = ahash_data(n, i, dsize);
1068 if (!mtype_data_equal(data, d, &multi))
1069 continue;
1070 if (SET_ELEM_EXPIRED(set, data))
1071 goto out;
1072
1073 ret = 0;
1074 clear_bit(i, n->used);
1075 smp_mb__after_atomic();
1076 if (i + 1 == n->pos)
1077 n->pos--;
1078 t->hregion[r].elements--;
1079 #ifdef IP_SET_HASH_WITH_NETS
1080 for (j = 0; j < IPSET_NET_COUNT; j++)
1081 mtype_del_cidr(set, h,
1082 NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
1083 #endif
1084 ip_set_ext_destroy(set, data);
1085
1086 if (atomic_read(&t->ref) && ext->target) {
1087 /* Resize is in progress and this is a kernel-side del,
1088 * save the values
1089 */
1090 x = kzalloc(sizeof(struct mtype_resize_ad),
1091 GFP_ATOMIC);
1092 if (x) {
1093 x->ad = IPSET_DEL;
1094 memcpy(&x->d, value,
1095 sizeof(struct mtype_elem));
1096 x->flags = flags;
1097 }
1098 }
1099 for (; i < n->pos; i++) {
1100 if (!test_bit(i, n->used))
1101 k++;
1102 }
1103 if (n->pos == 0 && k == 0) {
1104 t->hregion[r].ext_size -= ext_size(n->size, dsize);
1105 rcu_assign_pointer(hbucket(t, key), NULL);
1106 kfree_rcu(n, rcu);
1107 } else if (k >= AHASH_INIT_SIZE) {
1108 struct hbucket *tmp = kzalloc(sizeof(*tmp) +
1109 (n->size - AHASH_INIT_SIZE) * dsize,
1110 GFP_ATOMIC);
1111 if (!tmp)
1112 goto out;
1113 tmp->size = n->size - AHASH_INIT_SIZE;
1114 for (j = 0, k = 0; j < n->pos; j++) {
1115 if (!test_bit(j, n->used))
1116 continue;
1117 data = ahash_data(n, j, dsize);
1118 memcpy(tmp->value + k * dsize, data, dsize);
1119 set_bit(k, tmp->used);
1120 k++;
1121 }
1122 tmp->pos = k;
1123 t->hregion[r].ext_size -=
1124 ext_size(AHASH_INIT_SIZE, dsize);
1125 rcu_assign_pointer(hbucket(t, key), tmp);
1126 kfree_rcu(n, rcu);
1127 }
1128 goto out;
1129 }
1130
1131 out:
1132 spin_unlock_bh(&t->hregion[r].lock);
1133 if (x) {
1134 spin_lock_bh(&set->lock);
1135 list_add(&x->list, &h->ad);
1136 spin_unlock_bh(&set->lock);
1137 }
1138 if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
1139 pr_debug("Table destroy after resize by del: %p\n", t);
1140 mtype_ahash_destroy(set, t, false);
1141 }
1142 return ret;
1143 }
1144
1145 static int
1146 mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
1147 struct ip_set_ext *mext, struct ip_set *set, u32 flags)
1148 {
1149 if (!ip_set_match_extensions(set, ext, mext, flags, data))
1150 return 0;
1151 /* nomatch entries return -ENOTEMPTY */
1152 return mtype_do_data_match(data);
1153 }
1154
1155 #ifdef IP_SET_HASH_WITH_NETS
1156 /* Special test function which takes into account the different network
1157 * sizes added to the set
1158 */
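/* Example (illustrative): testing the plain address 192.168.1.1 against a
 * hash:net set holding /24 and /16 entries masks the address to each prefix
 * length recorded in h->nets[] (more specific first) and probes the hash
 * once per length, stopping at the first matching entry.
 */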
1159 static int
1160 mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
1161 const struct ip_set_ext *ext,
1162 struct ip_set_ext *mext, u32 flags)
1163 {
1164 struct htype *h = set->data;
1165 struct htable *t = rcu_dereference_bh(h->table);
1166 struct hbucket *n;
1167 struct mtype_elem *data;
1168 #if IPSET_NET_COUNT == 2
1169 struct mtype_elem orig = *d;
1170 int ret, i, j = 0, k;
1171 #else
1172 int ret, i, j = 0;
1173 #endif
1174 u32 key, multi = 0;
1175
1176 pr_debug("test by nets\n");
1177 for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) {
1178 #if IPSET_NET_COUNT == 2
1179 mtype_data_reset_elem(d, &orig);
1180 mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
1181 for (k = 0; k < NLEN && h->nets[k].cidr[1] && !multi;
1182 k++) {
1183 mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
1184 true);
1185 #else
1186 mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]));
1187 #endif
1188 key = HKEY(d, h->initval, t->htable_bits);
1189 n = rcu_dereference_bh(hbucket(t, key));
1190 if (!n)
1191 continue;
1192 for (i = 0; i < n->pos; i++) {
1193 if (!test_bit(i, n->used))
1194 continue;
1195 data = ahash_data(n, i, set->dsize);
1196 if (!mtype_data_equal(data, d, &multi))
1197 continue;
1198 ret = mtype_data_match(data, ext, mext, set, flags);
1199 if (ret != 0)
1200 return ret;
1201 #ifdef IP_SET_HASH_WITH_MULTI
1202 /* No match, reset multiple match flag */
1203 multi = 0;
1204 #endif
1205 }
1206 #if IPSET_NET_COUNT == 2
1207 }
1208 #endif
1209 }
1210 return 0;
1211 }
1212 #endif
1213
1214 /* Test whether the element is added to the set */
1215 static int
1216 mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
1217 struct ip_set_ext *mext, u32 flags)
1218 {
1219 struct htype *h = set->data;
1220 struct htable *t;
1221 struct mtype_elem *d = value;
1222 struct hbucket *n;
1223 struct mtype_elem *data;
1224 int i, ret = 0;
1225 u32 key, multi = 0;
1226
1227 rcu_read_lock_bh();
1228 t = rcu_dereference_bh(h->table);
1229 #ifdef IP_SET_HASH_WITH_NETS
1230 /* If we test an IP address and not a network address,
1231 * try all possible network sizes
1232 */
1233 for (i = 0; i < IPSET_NET_COUNT; i++)
1234 if (DCIDR_GET(d->cidr, i) != HOST_MASK)
1235 break;
1236 if (i == IPSET_NET_COUNT) {
1237 ret = mtype_test_cidrs(set, d, ext, mext, flags);
1238 goto out;
1239 }
1240 #endif
1241
1242 key = HKEY(d, h->initval, t->htable_bits);
1243 n = rcu_dereference_bh(hbucket(t, key));
1244 if (!n) {
1245 ret = 0;
1246 goto out;
1247 }
1248 for (i = 0; i < n->pos; i++) {
1249 if (!test_bit(i, n->used))
1250 continue;
1251 data = ahash_data(n, i, set->dsize);
1252 if (!mtype_data_equal(data, d, &multi))
1253 continue;
1254 ret = mtype_data_match(data, ext, mext, set, flags);
1255 if (ret != 0)
1256 goto out;
1257 }
1258 out:
1259 rcu_read_unlock_bh();
1260 return ret;
1261 }
1262
1263 /* Reply a HEADER request: fill out the header part of the set */
1264 static int
1265 mtype_head(struct ip_set *set, struct sk_buff *skb)
1266 {
1267 struct htype *h = set->data;
1268 const struct htable *t;
1269 struct nlattr *nested;
1270 size_t memsize;
1271 u32 elements = 0;
1272 size_t ext_size = 0;
1273 u8 htable_bits;
1274
1275 rcu_read_lock_bh();
1276 t = rcu_dereference_bh(h->table);
1277 mtype_ext_size(set, &elements, &ext_size);
1278 memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
1279 htable_bits = t->htable_bits;
1280 rcu_read_unlock_bh();
1281
1282 nested = nla_nest_start(skb, IPSET_ATTR_DATA);
1283 if (!nested)
1284 goto nla_put_failure;
1285 if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
1286 htonl(jhash_size(htable_bits))) ||
1287 nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
1288 goto nla_put_failure;
1289 #ifdef IP_SET_HASH_WITH_NETMASK
1290 if (h->netmask != HOST_MASK &&
1291 nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
1292 goto nla_put_failure;
1293 #endif
1294 #ifdef IP_SET_HASH_WITH_MARKMASK
1295 if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
1296 goto nla_put_failure;
1297 #endif
1298 if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
1299 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
1300 nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
1301 goto nla_put_failure;
1302 if (unlikely(ip_set_put_flags(skb, set)))
1303 goto nla_put_failure;
1304 nla_nest_end(skb, nested);
1305
1306 return 0;
1307 nla_put_failure:
1308 return -EMSGSIZE;
1309 }
1310
1311 /* Make it possible to run dumping in parallel with resizing */
1312 static void
1313 mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
1314 {
1315 struct htype *h = set->data;
1316 struct htable *t;
1317
1318 if (start) {
1319 rcu_read_lock_bh();
1320 t = ipset_dereference_bh_nfnl(h->table);
1321 atomic_inc(&t->uref);
1322 cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
1323 rcu_read_unlock_bh();
1324 } else if (cb->args[IPSET_CB_PRIVATE]) {
1325 t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
1326 if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
1327 pr_debug("Table destroy after resize "
1328 " by dump: %p\n", t);
1329 mtype_ahash_destroy(set, t, false);
1330 }
1331 cb->args[IPSET_CB_PRIVATE] = 0;
1332 }
1333 }
1334
1335 /* Reply a LIST/SAVE request: dump the elements of the specified set */
1336 static int
1337 mtype_list(const struct ip_set *set,
1338 struct sk_buff *skb, struct netlink_callback *cb)
1339 {
1340 const struct htable *t;
1341 struct nlattr *atd, *nested;
1342 const struct hbucket *n;
1343 const struct mtype_elem *e;
1344 u32 first = cb->args[IPSET_CB_ARG0];
1345 /* We assume that one hash bucket fits into one page */
1346 void *incomplete;
1347 int i, ret = 0;
1348
1349 atd = nla_nest_start(skb, IPSET_ATTR_ADT);
1350 if (!atd)
1351 return -EMSGSIZE;
1352
1353 pr_debug("list hash set %s\n", set->name);
1354 t = (const struct htable *)cb->args[IPSET_CB_PRIVATE];
1355 /* Expire may replace a hbucket with another one */
1356 rcu_read_lock();
1357 for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
1358 cb->args[IPSET_CB_ARG0]++) {
1359 cond_resched_rcu();
1360 incomplete = skb_tail_pointer(skb);
1361 n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0]));
1362 pr_debug("cb->arg bucket: %lu, t %p n %p\n",
1363 cb->args[IPSET_CB_ARG0], t, n);
1364 if (!n)
1365 continue;
1366 for (i = 0; i < n->pos; i++) {
1367 if (!test_bit(i, n->used))
1368 continue;
1369 e = ahash_data(n, i, set->dsize);
1370 if (SET_ELEM_EXPIRED(set, e))
1371 continue;
1372 pr_debug("list hash %lu hbucket %p i %u, data %p\n",
1373 cb->args[IPSET_CB_ARG0], n, i, e);
1374 nested = nla_nest_start(skb, IPSET_ATTR_DATA);
1375 if (!nested) {
1376 if (cb->args[IPSET_CB_ARG0] == first) {
1377 nla_nest_cancel(skb, atd);
1378 ret = -EMSGSIZE;
1379 goto out;
1380 }
1381 goto nla_put_failure;
1382 }
1383 if (mtype_data_list(skb, e))
1384 goto nla_put_failure;
1385 if (ip_set_put_extensions(skb, set, e, true))
1386 goto nla_put_failure;
1387 nla_nest_end(skb, nested);
1388 }
1389 }
1390 nla_nest_end(skb, atd);
1391 /* Set listing finished */
1392 cb->args[IPSET_CB_ARG0] = 0;
1393
1394 goto out;
1395
1396 nla_put_failure:
1397 nlmsg_trim(skb, incomplete);
1398 if (unlikely(first == cb->args[IPSET_CB_ARG0])) {
1399 pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n",
1400 set->name);
1401 cb->args[IPSET_CB_ARG0] = 0;
1402 ret = -EMSGSIZE;
1403 } else {
1404 nla_nest_end(skb, atd);
1405 }
1406 out:
1407 rcu_read_unlock();
1408 return ret;
1409 }
1410
1411 static int
1412 IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
1413 const struct xt_action_param *par,
1414 enum ipset_adt adt, struct ip_set_adt_opt *opt);
1415
1416 static int
1417 IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
1418 enum ipset_adt adt, u32 *lineno, u32 flags,
1419 bool retried);
1420
1421 static const struct ip_set_type_variant mtype_variant = {
1422 .kadt = mtype_kadt,
1423 .uadt = mtype_uadt,
1424 .adt = {
1425 [IPSET_ADD] = mtype_add,
1426 [IPSET_DEL] = mtype_del,
1427 [IPSET_TEST] = mtype_test,
1428 },
1429 .destroy = mtype_destroy,
1430 .flush = mtype_flush,
1431 .head = mtype_head,
1432 .list = mtype_list,
1433 .uref = mtype_uref,
1434 .resize = mtype_resize,
1435 .same_set = mtype_same_set,
1436 .region_lock = true,
1437 };
1438
1439 #ifdef IP_SET_EMIT_CREATE
1440 static int
1441 IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
1442 struct nlattr *tb[], u32 flags)
1443 {
1444 u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
1445 #ifdef IP_SET_HASH_WITH_MARKMASK
1446 u32 markmask;
1447 #endif
1448 u8 hbits;
1449 #ifdef IP_SET_HASH_WITH_NETMASK
1450 u8 netmask;
1451 #endif
1452 size_t hsize;
1453 struct htype *h;
1454 struct htable *t;
1455 u32 i;
1456
1457 pr_debug("Create set %s with family %s\n",
1458 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
1459
1460 #ifdef IP_SET_PROTO_UNDEF
1461 if (set->family != NFPROTO_UNSPEC)
1462 return -IPSET_ERR_INVALID_FAMILY;
1463 #else
1464 if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
1465 return -IPSET_ERR_INVALID_FAMILY;
1466 #endif
1467
1468 if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
1469 !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
1470 !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
1471 !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
1472 return -IPSET_ERR_PROTOCOL;
1473
1474 #ifdef IP_SET_HASH_WITH_MARKMASK
1475 /* Separated condition in order to avoid directive in argument list */
1476 if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK)))
1477 return -IPSET_ERR_PROTOCOL;
1478
1479 markmask = 0xffffffff;
1480 if (tb[IPSET_ATTR_MARKMASK]) {
1481 markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK]));
1482 if (markmask == 0)
1483 return -IPSET_ERR_INVALID_MARKMASK;
1484 }
1485 #endif
1486
1487 #ifdef IP_SET_HASH_WITH_NETMASK
1488 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
1489 if (tb[IPSET_ATTR_NETMASK]) {
1490 netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
1491
1492 if ((set->family == NFPROTO_IPV4 && netmask > 32) ||
1493 (set->family == NFPROTO_IPV6 && netmask > 128) ||
1494 netmask == 0)
1495 return -IPSET_ERR_INVALID_NETMASK;
1496 }
1497 #endif
1498
1499 if (tb[IPSET_ATTR_HASHSIZE]) {
1500 hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
1501 if (hashsize < IPSET_MIMINAL_HASHSIZE)
1502 hashsize = IPSET_MIMINAL_HASHSIZE;
1503 }
1504
1505 if (tb[IPSET_ATTR_MAXELEM])
1506 maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
1507
1508 hsize = sizeof(*h);
1509 h = kzalloc(hsize, GFP_KERNEL);
1510 if (!h)
1511 return -ENOMEM;
1512
1513 /* Compute htable_bits from the user input parameter hashsize.
1514 * Assume that hashsize == 2^htable_bits,
1515 * otherwise round up to the first 2^n value.
1516 */
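/* Example (illustrative): hashsize 1000 gives hbits = fls(999) == 10,
 * i.e. a 1024-bucket initial table; a power of two such as 1024 also maps
 * to hbits == 10, since fls(1023) == 10.
 */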
1517 hbits = fls(hashsize - 1);
1518 hsize = htable_size(hbits);
1519 if (hsize == 0) {
1520 kfree(h);
1521 return -ENOMEM;
1522 }
1523 t = ip_set_alloc(hsize);
1524 if (!t) {
1525 kfree(h);
1526 return -ENOMEM;
1527 }
1528 t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
1529 if (!t->hregion) {
1530 ip_set_free(t);
1531 kfree(h);
1532 return -ENOMEM;
1533 }
1534 h->gc.set = set;
1535 for (i = 0; i < ahash_numof_locks(hbits); i++)
1536 spin_lock_init(&t->hregion[i].lock);
1537 h->maxelem = maxelem;
1538 #ifdef IP_SET_HASH_WITH_NETMASK
1539 h->netmask = netmask;
1540 #endif
1541 #ifdef IP_SET_HASH_WITH_MARKMASK
1542 h->markmask = markmask;
1543 #endif
1544 get_random_bytes(&h->initval, sizeof(h->initval));
1545
1546 t->htable_bits = hbits;
1547 t->maxelem = h->maxelem / ahash_numof_locks(hbits);
1548 RCU_INIT_POINTER(h->table, t);
1549
1550 INIT_LIST_HEAD(&h->ad);
1551 set->data = h;
1552 #ifndef IP_SET_PROTO_UNDEF
1553 if (set->family == NFPROTO_IPV4) {
1554 #endif
1555 set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
1556 set->dsize = ip_set_elem_len(set, tb,
1557 sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)),
1558 __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem)));
1559 #ifndef IP_SET_PROTO_UNDEF
1560 } else {
1561 set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
1562 set->dsize = ip_set_elem_len(set, tb,
1563 sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)),
1564 __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
1565 }
1566 #endif
1567 set->timeout = IPSET_NO_TIMEOUT;
1568 if (tb[IPSET_ATTR_TIMEOUT]) {
1569 set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
1570 #ifndef IP_SET_PROTO_UNDEF
1571 if (set->family == NFPROTO_IPV4)
1572 #endif
1573 IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
1574 #ifndef IP_SET_PROTO_UNDEF
1575 else
1576 IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
1577 #endif
1578 }
1579 pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
1580 set->name, jhash_size(t->htable_bits),
1581 t->htable_bits, h->maxelem, set->data, t);
1582
1583 return 0;
1584 }
1585 #endif /* IP_SET_EMIT_CREATE */
1586
1587 #undef HKEY_DATALEN
1588