# Copyright The Cloud Custodian Authors.
# SPDX-License-Identifier: Apache-2.0
"""
Resource Filtering Logic
"""
import copy
import datetime
from datetime import timedelta
import fnmatch
import ipaddress
import logging
import operator
import re

from dateutil.tz import tzutc
from dateutil.parser import parse
from c7n.vendored.distutils import version
from random import sample

from c7n.element import Element
from c7n.exceptions import PolicyValidationError, PolicyExecutionError
from c7n.manager import ResourceManager
from c7n.registry import PluginRegistry
from c7n.resolver import ValuesFrom
from c7n.utils import (
    set_annotation,
    type_schema,
    parse_cidr,
    parse_date,
    jmespath_search,
    jmespath_compile
)
from c7n.manager import iter_filters


class FilterValidationError(Exception):
    pass


# Matching filters annotate their key onto objects
ANNOTATION_KEY = "c7n:MatchedFilters"


def glob_match(value, pattern):
    if not isinstance(value, str):
        return False
    return fnmatch.fnmatch(value, pattern)


def regex_match(value, regex):
    if not isinstance(value, str):
        return False
    # Note: Python internally caches compiled regexes;
    # it would be nice to use re2 here.
    return bool(re.match(regex, value, flags=re.IGNORECASE))


def regex_case_sensitive_match(value, regex):
    if not isinstance(value, str):
        return False
    # Note: Python internally caches compiled regexes;
    # it would be nice to use re2 here.
    return bool(re.match(regex, value))


def operator_in(x, y):
    return x in y


def operator_ni(x, y):
    return x not in y


def difference(x, y):
    return bool(set(x).difference(y))


def intersect(x, y):
    return bool(set(x).intersection(y))


def mod(x, y):
    return bool(x % y)


OPERATORS = {
    'eq': operator.eq,
    'equal': operator.eq,
    'ne': operator.ne,
    'not-equal': operator.ne,
    'gt': operator.gt,
    'greater-than': operator.gt,
    'ge': operator.ge,
    'gte': operator.ge,
    'le': operator.le,
    'lte': operator.le,
    'lt': operator.lt,
    'less-than': operator.lt,
    'glob': glob_match,
    'regex': regex_match,
    'regex-case': regex_case_sensitive_match,
    'in': operator_in,
    'ni': operator_ni,
    'not-in': operator_ni,
    'contains': operator.contains,
    'difference': difference,
    'intersect': intersect,
    'mod': mod}
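
# Illustrative lookups (not exhaustive); each operator is called with the
# extracted resource value on the left and the policy value on the right:
#   OPERATORS['gte'](3, 2)                -> True
#   OPERATORS['glob']('web-01', 'web-*')  -> True
#   OPERATORS['in']('a', ['a', 'b'])      -> True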


VALUE_TYPES = [
    'age', 'integer', 'expiration', 'normalize', 'size',
    'cidr', 'cidr_size', 'swap', 'resource_count', 'expr',
    'unique_size', 'date', 'version', 'float']
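
# The names above correspond to the `value_type` option of the value filter;
# most are interpreted in ValueFilter.process_value_type, while `resource_count`
# is special-cased in ValueFilter.validate/process.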


class FilterRegistry(PluginRegistry):

    value_filter_class = None

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        self.register('value', ValueFilter)
        self.register('or', Or)
        self.register('and', And)
        self.register('not', Not)
        self.register('event', EventFilter)
        self.register('reduce', ReduceFilter)
        self.register('list-item', ListItemFilter)

    def parse(self, data, manager):
        results = []
        for d in data:
            results.append(self.factory(d, manager))
        return results

    def factory(self, data, manager=None):
        """Factory func for filters.

        data - policy config for filters
        manager - resource type manager (ec2, s3, etc)
        """

        # Make the syntax a little nicer for common cases.
        if isinstance(data, dict) and len(data) == 1 and 'type' not in data:
            op = list(data.keys())[0]
            if op == 'or':
                return self['or'](data, self, manager)
            elif op == 'and':
                return self['and'](data, self, manager)
            elif op == 'not':
                return self['not'](data, self, manager)
            return self.value_filter_class(data, manager)
        if isinstance(data, str):
            filter_type = data
            data = {'type': data}
        else:
            filter_type = data.get('type')
            if not filter_type:
                raise PolicyValidationError(
                    "%s Invalid Filter %s" % (
                        self.plugin_type, data))
        filter_class = self.get(filter_type)
        if filter_class is not None:
            return filter_class(data, manager)
        else:
            raise PolicyValidationError(
                "%s Invalid filter type %s" % (
                    self.plugin_type, data))


def trim_runtime(filters):
    """Remove runtime filters.

    Some filters can only be effectively evaluated at policy
    execution, i.e. event filters.

    When evaluating conditions for dryrun or provisioning stages we
    remove them.
    """
    def remove_filter(f):
        block = f.get_block_parent()
        block.filters.remove(f)
        if isinstance(block, BooleanGroupFilter) and not len(block):
            remove_filter(block)

    for f in iter_filters(filters):
        if isinstance(f, EventFilter):
            remove_filter(f)


# Really should be an abstract base class (abc) or
# zope.interface

class Filter(Element):

    log = logging.getLogger('custodian.filters')

    def __init__(self, data, manager=None):
        self.data = data
        self.manager = manager

    def process(self, resources, event=None):
        """ Bulk process resources and return filtered set."""
        return list(filter(self, resources))

    def get_block_operator(self):
        """Determine the immediate parent boolean operator for a filter"""
        # Top level operator is `and`
        block = self.get_block_parent()
        if block.type in ('and', 'or', 'not'):
            return block.type
        return 'and'

    def get_block_parent(self):
        """Get the block parent for a filter"""
        block_stack = [self.manager]
        for f in self.manager.iter_filters(block_end=True):
            if f is None:
                block_stack.pop()
            elif f == self:
                return block_stack[-1]
            elif f.type in ('and', 'or', 'not'):
                block_stack.append(f)

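    # When a filter runs inside a boolean block, its matched-key annotations
    # are merged with whatever earlier filters recorded on the resource:
    # intersection for 'and'/'not' blocks, union for 'or' blocks.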
    def merge_annotation(self, r, annotation_key, values):
        block_op = self.get_block_operator()
        if block_op in ('and', 'not'):
            r[self.matched_annotation_key] = intersect_list(
                values,
                r.get(self.matched_annotation_key))
        elif block_op == 'or':
            r[self.matched_annotation_key] = union_list(
                values,
                r.get(self.matched_annotation_key))


class BaseValueFilter(Filter):
    expr = None

    def __init__(self, data, manager=None):
        super(BaseValueFilter, self).__init__(data, manager)
        self.expr = {}

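    # Key resolution order (illustrative): a 'tag:Name' style key is looked up
    # in the AWS 'Tags' list, then GCP 'labels', then Azure/GCP 'tags' mappings;
    # a bare key present on the resource is returned directly; anything else is
    # treated as a JMESPath expression, compiled once and cached on self.expr.
    # e.g. get_resource_value('tag:Name', {'Tags': [{'Key': 'Name', 'Value': 'web'}]})
    # returns 'web'.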
    def get_resource_value(self, k, i, regex=None):
        r = None
        if k.startswith('tag:'):
            tk = k.split(':', 1)[1]
            if 'Tags' in i:
                for t in i.get("Tags", []):
                    if t.get('Key') == tk:
                        r = t.get('Value')
                        break
            # GCP schema: 'labels': {'key': 'value'}
            elif 'labels' in i:
                r = i.get('labels', {}).get(tk, None)
            # GCP has a secondary form of labels called tags
            # as labels without values.
            # Azure schema: 'tags': {'key': 'value'}
            elif 'tags' in i:
                r = (i.get('tags', {}) or {}).get(tk, None)
        elif k in i:
            r = i.get(k)
        elif k not in self.expr:
            self.expr[k] = jmespath_compile(k)
            r = self.expr[k].search(i)
        else:
            r = self.expr[k].search(i)

        if regex:
            r = ValueRegex(regex).get_resource_value(r)
        return r

    def _validate_value_regex(self, regex):
        """Specific validation for `value_regex` type

        The `value_regex` type works a little differently. In
        particular it doesn't support OPERATORS that perform
        operations on a list of values, specifically 'intersect',
        'contains', 'difference', 'in' and 'not-in'
        """
        # Sanity check that we can compile
        try:
            pattern = re.compile(regex)
            if pattern.groups != 1:
                raise PolicyValidationError(
                    "value_regex must have a single capturing group: %s" %
                    self.data)
        except re.error as e:
            raise PolicyValidationError(
                "Invalid value_regex: %s %s" % (e, self.data))
        return self


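# Order-preserving list helpers used by Filter.merge_annotation when combining
# matched-filter annotations across a boolean block, e.g. (illustrative):
#   intersect_list(['a', 'b'], ['b', 'c']) -> ['b']
#   union_list(['a', 'b'], ['b', 'c'])     -> ['a', 'b', 'c']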
def intersect_list(a, b):
    if b is None:
        return a
    elif a is None:
        return b
    res = []
    for x in a:
        if x in b:
            res.append(x)
    return res


def union_list(a, b):
    if not b:
        return a
    if not a:
        return b
    res = a
    res.extend(x for x in b if x not in a)
    return res


class BooleanGroupFilter(Filter):

    def __init__(self, data, registry, manager):
        super(BooleanGroupFilter, self).__init__(data)
        self.registry = registry
        self.filters = registry.parse(list(self.data.values())[0], manager)
        self.manager = manager

    def validate(self):
        for f in self.filters:
            f.validate()
        return self

    def get_resource_type_id(self):
        resource_type = self.manager.get_model()
        return resource_type.id

    def __len__(self):
        return len(self.filters)

    def __bool__(self):
        return True

    def get_deprecations(self):
        """Return any matching deprecations for the nested filters."""
        deprecations = []
        for f in self.filters:
            deprecations.extend(f.get_deprecations())
        return deprecations


class Or(BooleanGroupFilter):

    def process(self, resources, event=None):
        if self.manager:
            return self.process_set(resources, event)
        return super(Or, self).process(resources, event)

    def __call__(self, r):
        """Fallback for older unit tests that don't utilize a query manager"""
        for f in self.filters:
            if f(r):
                return True
        return False

    def process_set(self, resources, event):
        rtype_id = self.get_resource_type_id()
        compiled = None
        if '.' in rtype_id:
            compiled = jmespath_compile(rtype_id)
            resource_map = {compiled.search(r): r for r in resources}
        else:
            resource_map = {r[rtype_id]: r for r in resources}
        results = set()
        for f in self.filters:
            if compiled:
                results = results.union([
                    compiled.search(r) for r in f.process(resources, event)])
            else:
                results = results.union([
                    r[rtype_id] for r in f.process(resources, event)])
        return [resource_map[r_id] for r_id in results]


class And(BooleanGroupFilter):

    def process(self, resources, events=None):
        if self.manager:
            sweeper = AnnotationSweeper(self.get_resource_type_id(), resources)

        for f in self.filters:
            resources = f.process(resources, events)
            if not resources:
                break

        if self.manager:
            sweeper.sweep(resources)

        return resources


class Not(BooleanGroupFilter):

    def process(self, resources, event=None):
        if self.manager:
            return self.process_set(resources, event)
        return super(Not, self).process(resources, event)

    def __call__(self, r):
        """Fallback for older unit tests that don't utilize a query manager"""

        # There is an implicit 'and' for self.filters
        # ~(A ^ B ^ ... ^ Z) = ~A v ~B v ... v ~Z
        for f in self.filters:
            if not f(r):
                return True
        return False

    def process_set(self, resources, event):
        rtype_id = self.get_resource_type_id()
        compiled = None
        if '.' in rtype_id:
            compiled = jmespath_compile(rtype_id)
            resource_map = {compiled.search(r): r for r in resources}
        else:
            resource_map = {r[rtype_id]: r for r in resources}
        sweeper = AnnotationSweeper(rtype_id, resources)

        for f in self.filters:
            resources = f.process(resources, event)
            if not resources:
                break

        before = set(resource_map.keys())
        if compiled:
            after = {compiled.search(r) for r in resources}
        else:
            after = {r[rtype_id] for r in resources}
        results = before - after
        sweeper.sweep([])

        return [resource_map[r_id] for r_id in results]


class AnnotationSweeper:
    """Support clearing annotations set within a block filter.

    See https://github.com/cloud-custodian/cloud-custodian/issues/2116
    """
    def __init__(self, id_key, resources):
        self.id_key = id_key
        ra_map = {}
        resource_map = {}
        compiled = None
        if '.' in id_key:
            compiled = jmespath_compile(self.id_key)
        for r in resources:
            if compiled:
                id_ = compiled.search(r)
            else:
                id_ = r[self.id_key]
            ra_map[id_] = {k: v for k, v in r.items() if k.startswith('c7n')}
            resource_map[id_] = r
        # We keep a full copy of the annotation keys to allow restore.
        self.ra_map = copy.deepcopy(ra_map)
        self.resource_map = resource_map

    def sweep(self, resources):
        compiled = None
        if '.' in self.id_key:
            compiled = jmespath_compile(self.id_key)
            diff = set(self.ra_map).difference([compiled.search(r) for r in resources])
        else:
            diff = set(self.ra_map).difference([r[self.id_key] for r in resources])
        for rid in diff:
            # Clear annotations if the block filter didn't match
            akeys = [k for k in self.resource_map[rid] if k.startswith('c7n')]
            for k in akeys:
                del self.resource_map[rid][k]
            # Restore annotations that may have existed prior to the block filter.
            self.resource_map[rid].update(self.ra_map[rid])


# The default LooseVersion will raise when compared against sentinel strings
# like 'present', which are used in the value as shorthand for certain options.
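# e.g. ComparableVersion('3.1.2') == 'absent' evaluates to False rather than
# raising TypeError, while ComparableVersion('1.10') > ComparableVersion('1.9')
# still compares component-wise.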
class ComparableVersion(version.LooseVersion):
    def __eq__(self, other):
        try:
            return super(ComparableVersion, self).__eq__(other)
        except TypeError:
            return False


class ValueFilter(BaseValueFilter):
    """Generic value filter using jmespath
    """
    op = v = vtype = None

    schema = {
        'type': 'object',
        # Doesn't mix well with inherits that extend
        'additionalProperties': False,
        'required': ['type'],
        'properties': {
            # Doesn't mix well as enum with inherits that extend
            'type': {'enum': ['value']},
            'key': {'type': 'string'},
            'value_type': {'$ref': '#/definitions/filters_common/value_types'},
            'default': {'type': 'object'},
            'value_regex': {'type': 'string'},
            'value_from': {'$ref': '#/definitions/filters_common/value_from'},
            'value': {'$ref': '#/definitions/filters_common/value'},
            'op': {'$ref': '#/definitions/filters_common/comparison_operators'},
            'value_path': {'type': 'string'}
        }
    }
    schema_alias = True
    annotate = True
    required_keys = {'value', 'key'}

    def _validate_resource_count(self):
        """ Specific validation for `resource_count` type

        The `resource_count` type works a little differently because it operates
        on the entire set of resources. It:
          - does not require `key`
          - `value` must be a number
          - supports a subset of the OPERATORS list
        """
        for field in ('op', 'value'):
            if field not in self.data:
                raise PolicyValidationError(
                    "Missing '%s' in value filter %s" % (field, self.data))

        if not (isinstance(self.data['value'], int) or
                isinstance(self.data['value'], list)):
            raise PolicyValidationError(
                "`value` must be an integer in resource_count filter %s" % self.data)

        # I don't see how to support regex for this?
        if (self.data['op'] not in OPERATORS or
                self.data['op'] in {'regex', 'regex-case'} or
                'value_regex' in self.data):
            raise PolicyValidationError(
                "Invalid operator in value filter %s" % self.data)

        return self

    def validate(self):
        if len(self.data) == 1:
            return self

        # `resource_count` requires a slightly different schema than the rest of
        # the value filters because it operates on the full resource list
        if self.data.get('value_type') == 'resource_count':
            return self._validate_resource_count()
        elif self.data.get('value_type') == 'date':
            if not parse_date(self.data.get('value')):
                raise PolicyValidationError(
                    "value_type: date with invalid date value: %s" % (
                        self.data.get('value', '')))
        if 'key' not in self.data and 'key' in self.required_keys:
            raise PolicyValidationError(
                "Missing 'key' in value filter %s" % self.data)
        if ('value' not in self.data and
                'value_from' not in self.data and
                'value_path' not in self.data and
                'value' in self.required_keys):
            raise PolicyValidationError(
                "Missing 'value' in value filter %s" % self.data)
        if 'op' in self.data:
            if self.data['op'] not in OPERATORS:
                raise PolicyValidationError(
                    "Invalid operator in value filter %s" % self.data)
            if self.data['op'] in {'regex', 'regex-case'}:
                # Sanity check that we can compile
                try:
                    re.compile(self.data['value'])
                except re.error as e:
                    raise PolicyValidationError(
                        "Invalid regex: %s %s" % (e, self.data))
        if 'value_regex' in self.data:
            return self._validate_value_regex(self.data['value_regex'])

        return self

    def __call__(self, i):
        if self.data.get('value_type') == 'resource_count':
            return self.process(i)

        matched = self.match(i)
        if matched and self.annotate:
            set_annotation(i, ANNOTATION_KEY, self.k)
        return matched

    def process(self, resources, event=None):
        # For the resource_count filter we operate on the full set of resources.
        if self.data.get('value_type') == 'resource_count':
            op = OPERATORS[self.data.get('op')]
            if op(len(resources), self.data.get('value')):
                return resources
            return []

        return super(ValueFilter, self).process(resources, event)

    def get_resource_value(self, k, i):
        return super(ValueFilter, self).get_resource_value(k, i, self.data.get('value_regex'))

    def get_path_value(self, i):
        """Retrieve values using JMESPath.

        When using a Value Filter, a ``value_path`` can be specified.
        This means the value(s) the filter will compare against are
        calculated during the initialization of the filter.

        Note that this option only pulls properties of the resource
        currently being filtered.

        .. code-block:: yaml

            - name: find-admins-with-user-roles
              resource: gcp.project
              filters:
                - type: iam-policy
                  doc:
                    key: bindings[?(role=='roles/admin')].members[]
                    op: intersect
                    value_path: bindings[?(role=='roles/user_access')].members[]

        The iam-policy filter above uses the implementation of the generic
        Value Filter. This implementation allows for the comparison of two
        separate lists of values within the same resource.
        """
        return jmespath_search(self.data.get('value_path'), i)

    def match(self, i):
        if self.v is None and len(self.data) == 1:
            [(self.k, self.v)] = self.data.items()
        elif self.v is None and not hasattr(self, 'content_initialized'):
            self.k = self.data.get('key')
            self.op = self.data.get('op')
            if 'value_from' in self.data:
                values = ValuesFrom(self.data['value_from'], self.manager)
                self.v = values.get_values()
            elif 'value_path' in self.data:
                self.v = self.get_path_value(i)
            else:
                self.v = self.data.get('value')
            self.content_initialized = True
            self.vtype = self.data.get('value_type')

        if i is None:
            return False

        # value extract
        r = self.get_resource_value(self.k, i)
        if self.op in ('in', 'not-in') and r is None:
            r = ()

        # value type conversion
        if self.vtype is not None:
            v, r = self.process_value_type(self.v, r, i)
        else:
            v = self.v

        # Value match
        if r is None and v == 'absent':
            return True
        elif r is not None and v == 'present':
            return True
        elif v == 'not-null' and r:
            return True
        elif v == 'empty' and not r:
            return True
        elif self.op:
            op = OPERATORS[self.op]
            try:
                return op(r, v)
            except TypeError:
                return False
        elif r == v:
            return True

        return False

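    # process_value_type() normalizes the (sentinel, value) pair before the
    # operator runs. Illustrative example: with `value_type: age` the policy
    # value (days) becomes a cutoff datetime and the pair is reversed, so
    # `op: greater-than, value: 30` matches resources whose date attribute is
    # older than roughly 30 days.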
    def process_value_type(self, sentinel, value, resource):
        if self.vtype == 'normalize' and isinstance(value, str):
            return sentinel, value.strip().lower()

        elif self.vtype == 'expr':
            sentinel = self.get_resource_value(sentinel, resource)
            return sentinel, value

        elif self.vtype == 'integer':
            try:
                value = int(str(value).strip())
            except ValueError:
                value = 0
        elif self.vtype == 'float':
            try:
                value = float(str(value).strip())
            except ValueError:
                value = 0.0
        elif self.vtype == 'size':
            try:
                return sentinel, len(value)
            except TypeError:
                return sentinel, 0
        elif self.vtype == 'unique_size':
            try:
                return sentinel, len(set(value))
            except TypeError:
                return sentinel, 0
        elif self.vtype == 'swap':
            return value, sentinel
        elif self.vtype == 'date':
            return parse_date(sentinel), parse_date(value)
        elif self.vtype == 'age':
            if not isinstance(sentinel, datetime.datetime):
                sentinel = datetime.datetime.now(tz=tzutc()) - timedelta(sentinel)
            value = parse_date(value)
            if value is None:
                # compatibility
                value = 0
            # Reverse the age comparison; we typically want to compare the value
            # being greater than the sentinel. Otherwise the syntax for age
            # comparisons is intuitively wrong.
            return value, sentinel
        elif self.vtype == 'cidr':
            s = parse_cidr(sentinel)
            v = parse_cidr(value)
            if (isinstance(s, ipaddress._BaseAddress) and isinstance(v, ipaddress._BaseNetwork)):
                return v, s
            return s, v
        elif self.vtype == 'cidr_size':
            cidr = parse_cidr(value)
            if cidr:
                return sentinel, cidr.prefixlen
            return sentinel, 0

        # Allows for expiration filtering, for events in the future as opposed
        # to events in the past which age filtering allows for.
        elif self.vtype == 'expiration':
            if not isinstance(sentinel, datetime.datetime):
                sentinel = datetime.datetime.now(tz=tzutc()) + timedelta(sentinel)
            value = parse_date(value)
            if value is None:
                value = 0
            return sentinel, value

        # Allows for comparing version numbers, e.g. when you expect a minimum version.
        elif self.vtype == 'version':
            s = ComparableVersion(sentinel)
            v = ComparableVersion(value)
            return s, v

        return sentinel, value


FilterRegistry.value_filter_class = ValueFilter


class AgeFilter(Filter):
    """Automatically filter resources older than a given date.

    **Deprecated**: use a value filter with `value_type: age`, which can be
    done on any attribute.
    """
    threshold_date = None

    # The name of the attribute to compare to the threshold; must override in subclass
    date_attribute = None

    schema = None

    def validate(self):
        if not self.date_attribute:
            raise NotImplementedError(
                "date_attribute must be overridden in subclass")
        return self

    def get_resource_date(self, i):
        v = i[self.date_attribute]
        if not isinstance(v, datetime.datetime):
            v = parse(v)
        if not v.tzinfo:
            v = v.replace(tzinfo=tzutc())
        return v

    def __call__(self, i):
        v = self.get_resource_date(i)
        if v is None:
            return False
        op = OPERATORS[self.data.get('op', 'greater-than')]

        if not self.threshold_date:

            days = self.data.get('days', 0)
            hours = self.data.get('hours', 0)
            minutes = self.data.get('minutes', 0)
            # Work around placebo issues with tz
            if v.tzinfo:
                n = datetime.datetime.now(tz=tzutc())
            else:
                n = datetime.datetime.now()
            self.threshold_date = n - timedelta(days=days, hours=hours, minutes=minutes)

        return op(self.threshold_date, v)


class EventFilter(ValueFilter):
    """Filter a resource based on an event."""

    schema = type_schema('event', rinherit=ValueFilter.schema)
    schema_alias = True

    def validate(self):
        if 'mode' not in self.manager.data:
            raise PolicyValidationError(
                "Event filters can only be used with lambda policies in %s" % (
                    self.manager.data,))
        return self

    def process(self, resources, event=None):
        if event is None:
            return resources
        if self(event):
            return resources
        return []


class ValueRegex:
    """Allows filtering based on the output of a regex capture.
    This is useful for parsing data that has a weird format.

    Instead of comparing the contents of the 'resource value' with the 'value',
    it applies the regex to the contents of the 'resource value' and compares
    the result of the capture group defined in that regex with the 'value'.
    Therefore you must have a single capture group defined in the regex.

    If the regex doesn't find a match it will return 'None'.

    Example of getting a datetime object to make an 'expiration' comparison::

        type: value
        value_regex: ".*delete_after=([0-9]{4}-[0-9]{2}-[0-9]{2}).*"
        key: "tag:company_mandated_metadata"
        value_type: expiration
        op: lte
        value: 0
    """

    def __init__(self, expr):
        self.expr = expr

    def get_resource_value(self, resource):
        if resource is None:
            return resource
        try:
            capture = re.match(self.expr, resource)
        except (ValueError, TypeError):
            return None
        if capture is None:  # regex didn't capture anything
            return None
        return capture.group(1)


class ReduceFilter(BaseValueFilter):
    """Generic reduce filter to group, sort, and limit your resources.

    This example will select the longest running instance from each ASG.

    :example:

    .. code-block:: yaml

      - name: oldest-instance-by-asg
        resource: ec2
        filters:
          - "tag:aws:autoscaling:groupName": present
          - type: reduce
            group-by: "tag:aws:autoscaling:groupName"
            sort-by: "LaunchTime"
            order: asc
            limit: 1

    Or you might want to randomly select 10 percent of your resources,
    but no more than 15 in total.

    :example:

    .. code-block:: yaml

      - name: random-selection
        resource: ec2
        filters:
          - type: reduce
            order: randomize
            limit: 15
            limit-percent: 10

    """
    annotate = False

    schema = {
        'type': 'object',
        # Doesn't mix well with inherits that extend
        'additionalProperties': False,
        'required': ['type'],
        'properties': {
            # Doesn't mix well as enum with inherits that extend
            'type': {'enum': ['reduce']},
            'group-by': {
                'oneOf': [
                    {'type': 'string'},
                    {
                        'type': 'object',
                        'key': {'type': 'string'},
                        'value_type': {'enum': ['string', 'number', 'date']},
                        'value_regex': 'string',
                    },
                ]
            },
            'sort-by': {
                'oneOf': [
                    {'type': 'string'},
                    {
                        'type': 'object',
                        'key': {'type': 'string'},
                        'value_type': {'enum': ['string', 'number', 'date']},
                        'value_regex': 'string',
                    },
                ]
            },
            'order': {'enum': ['asc', 'desc', 'reverse', 'randomize']},
            'null-order': {'enum': ['first', 'last']},
            'limit': {'type': 'number', 'minimum': 0},
            'limit-percent': {'type': 'number', 'minimum': 0, 'maximum': 100},
            'discard': {'type': 'number', 'minimum': 0},
            'discard-percent': {'type': 'number', 'minimum': 0, 'maximum': 100},
        },
    }
    schema_alias = True

    def __init__(self, data, manager):
        super(ReduceFilter, self).__init__(data, manager)
        self.order = self.data.get('order', 'asc')
        self.group_by = self.get_sort_config('group-by')
        self.sort_by = self.get_sort_config('sort-by')

    def validate(self):
        # make sure the regexes compile
        if 'value_regex' in self.group_by:
            self._validate_value_regex(self.group_by['value_regex'])
        if 'value_regex' in self.sort_by:
            self._validate_value_regex(self.sort_by['value_regex'])
        return self

    def process(self, resources, event=None):
        groups = self.group(resources)

        # specified either of the sorting options, so sort
        if 'sort-by' in self.data or 'order' in self.data:
            groups = self.sort_groups(groups)

        # now apply any limits to the groups and concatenate
        return list(filter(None, self.limit(groups)))

    def group(self, resources):
        groups = {}
        for r in resources:
            v = self._value_to_sort(self.group_by, r)
            vstr = str(v)
            if vstr not in groups:
                groups[vstr] = {'sortkey': v, 'resources': []}
            groups[vstr]['resources'].append(r)
        return groups

    def get_sort_config(self, key):
        # allow `foo: bar` but convert to
        # `foo: {'key': bar}`
        d = self.data.get(key, {})
        if isinstance(d, str):
            d = {'key': d}
        d['null_sort_value'] = self.null_sort_value(d)
        return d

    def sort_groups(self, groups):
        for g in groups:
            groups[g]['resources'] = self.reorder(
                groups[g]['resources'],
                key=lambda r: self._value_to_sort(self.sort_by, r),
            )
        return groups

    def _value_to_sort(self, config, r):
        expr = config.get('key')
        vtype = config.get('value_type', 'string')
        vregex = config.get('value_regex')
        v = None

        try:
            # extract value based on jmespath
            if expr:
                v = self.get_resource_value(expr, r, vregex)

            if v is not None:
                # now convert to expected type
                if vtype == 'number':
                    v = float(v)
                elif vtype == 'date':
                    v = parse_date(v)
                else:
                    v = str(v)
        except (AttributeError, ValueError):
            v = None

        if v is None:
            v = config.get('null_sort_value')
        return v

    def null_sort_value(self, config):
        vtype = config.get('value_type', 'string')
        placement = self.data.get('null-order', 'last')

        if (placement == 'last' and self.order == 'desc') or (
            placement != 'last' and self.order != 'desc'
        ):
            # return a value that will sort first
            if vtype == 'number':
                return float('-inf')
            elif vtype == 'date':
                return datetime.datetime.min.replace(tzinfo=tzutc())
            return ''
        else:
            # return a value that will sort last
            if vtype == 'number':
                return float('inf')
            elif vtype == 'date':
                return datetime.datetime.max.replace(tzinfo=tzutc())
            return '\uffff'

    def limit(self, groups):
        results = []

        max = self.data.get('limit', 0)
        pct = self.data.get('limit-percent', 0)
        drop = self.data.get('discard', 0)
        droppct = self.data.get('discard-percent', 0)
        ordered = list(groups)
        if 'group-by' in self.data or 'order' in self.data:
            ordered = self.reorder(ordered, key=lambda r: groups[r]['sortkey'])
        for g in ordered:
            # discard X first
            if droppct > 0:
                n = int(droppct / 100 * len(groups[g]['resources']))
                if n > drop:
                    drop = n
            if drop > 0:
                groups[g]['resources'] = groups[g]['resources'][drop:]

            # then limit the remaining
            count = len(groups[g]['resources'])
            if pct > 0:
                count = int(pct / 100 * len(groups[g]['resources']))
            if max > 0 and max < count:
                count = max
            results.extend(groups[g]['resources'][0:count])
        return results

    def reorder(self, items, key=None):
        if self.order == 'randomize':
            return sample(items, k=len(items))
        elif self.order == 'reverse':
            return items[::-1]
        else:
            return sorted(items, key=key, reverse=(self.order == 'desc'))


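# ListItemFilter evaluates its nested `attrs` block by treating each element of
# the targeted list as a pseudo-resource. ListItemModel and
# ListItemResourceManager supply the synthetic 'c7n:_id' identity key that the
# nested filter registry uses to track which elements matched.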
class ListItemModel:
    id = 'c7n:_id'


class ListItemRegistry(FilterRegistry):

    def __init__(self, *args, **kw):
        super(FilterRegistry, self).__init__(*args, **kw)
        self.register('value', ValueFilter)
        self.register('or', Or)
        self.register('and', And)
        self.register('not', Not)
        self.register('reduce', ReduceFilter)


class ListItemResourceManager(ResourceManager):
    filter_registry = ListItemRegistry('filters')

    def get_model(self):
        return ListItemModel


class ListItemFilter(Filter):
    """
    Perform multi attribute filtering on items within a list,
    for example looking for security groups that have rules which
    include 0.0.0.0/0 and port 22 open.

    :example:

    .. code-block:: yaml

      policies:
        - name: security-group-with-22-open-to-world
          resource: aws.security-group
          filters:
            - type: list-item
              key: IpPermissions
              attrs:
                - type: value
                  key: IpRanges[].CidrIp
                  value: '0.0.0.0/0'
                  op: in
                  value_type: swap
                - type: value
                  key: FromPort
                  value: 22
                - type: value
                  key: ToPort
                  value: 22
        - name: find-task-def-not-using-registry
          resource: aws.ecs-task-definition
          filters:
            - not:
              - type: list-item
                key: containerDefinitions
                attrs:
                  - not:
                    - type: value
                      key: image
                      value: "${account_id}.dkr.ecr.us-east-2.amazonaws.com.*"
                      op: regex
    """

    schema = type_schema(
        'list-item',
        **{
            'key': {'type': 'string'},
            'attrs': {'$ref': '#/definitions/filters_common/list_item_attrs'},
            'count': {'type': 'number'},
            'count_op': {'$ref': '#/definitions/filters_common/comparison_operators'},
        },
    )

    schema_alias = True
    annotate_items = False
    item_annotation_key = "c7n:ListItemMatches"
    _expr = None

    @property
    def expr(self):
        if self._expr:
            return self._expr
        self._expr = jmespath_compile(self.data['key'])
        return self._expr

    def check_count(self, rcount):
        if 'count' not in self.data:
            return False
        count = self.data['count']
        op = OPERATORS[self.data.get('count_op', 'eq')]
        if op(rcount, count):
            return True

    def process(self, resources, event=None):
        result = []
        frm = ListItemResourceManager(
            self.manager.ctx, data={'filters': self.data.get('attrs', [])})
        for r in resources:
            list_values = self.get_item_values(r)
            if not list_values:
                if self.check_count(0):
                    result.append(r)
                continue
            if not isinstance(list_values, list):
                item_type = type(list_values)
                raise PolicyExecutionError(
                    f"list-item filter value for {self.data['key']} is a {item_type} not a list"
                )
            for idx, list_value in enumerate(list_values):
                list_value['c7n:_id'] = idx
            list_resources = frm.filter_resources(list_values, event)
            matched_indices = [r['c7n:_id'] for r in list_resources]
            for idx, list_value in enumerate(list_values):
                list_value.pop('c7n:_id')
            if 'count' in self.data:
                if self.check_count(len(list_resources)):
                    result.append(r)
            elif list_resources:
                if not self.annotate_items:
                    annotations = [
                        f'{self.data.get("key", self.type)}[{str(i)}]'
                        for i in matched_indices
                    ]
                else:
                    annotations = list_resources
                r.setdefault(self.item_annotation_key, [])
                r[self.item_annotation_key].extend(annotations)
                result.append(r)
        return result

    def get_item_values(self, resource):
        return self.expr.search(resource)

    def __call__(self, resource):
        if self.process((resource,)):
            return True
        return False