addons: Reduce memory consumption (#2395)

* addons: Reduce memory consumption

Parse dump files incrementally using ElementTree.iterparse. Clean unused
resources during parsing. This method is explained in the following
article: https://www.ibm.com/developerworks/xml/library/x-hiperfparse/

Memory consumption was reduced by about 30% (measured with mprof);
execution time increased by about 5% (measured with the time utility).
A more detailed description is available in the PR.

* Switch to lxml and update iterparse routines

Use the lxml module instead of the default xml.etree. lxml provides a
convenient wrapper around the iterparse method that accepts a `tag` argument.
This makes it easier for the incremental parsing routines to select specific
tags from the root tree, like `dump` and `dumps`.

The Element.clear() method was replaced by `lxml_clean` because lxml
attaches additional information to nodes, which also has to be removed.

Added a note about high RAM consumption on large dump files.
This commit doesn't solve the problem completely, but it provides a way
to improve the current parser by adding incremental Configuration
serialization later.

* Working on iterative parser

* Added iterative Configurations parser

* fix

* Fix varlist iteration

* make sure that standards node was loaded
This commit is contained in:
Georgy Komarov 2019-12-27 10:50:56 +03:00 committed by Daniel Marjamäki
parent ec4668353d
commit d977761e76
9 changed files with 267 additions and 183 deletions

View File

@ -400,7 +400,7 @@ if __name__ == '__main__':
if not args.quiet:
print('Checking %s...' % dumpfile)
data = cppcheckdata.parsedump(dumpfile)
data = cppcheckdata.CppcheckData(dumpfile)
if VERIFY:
VERIFY_ACTUAL = []
@ -411,8 +411,8 @@ if __name__ == '__main__':
if re.match(r'cert-[A-Z][A-Z][A-Z][0-9][0-9].*',word):
VERIFY_EXPECTED.append(str(tok.linenr) + ':' + word)
for cfg in data.configurations:
if (len(data.configurations) > 1) and (not args.quiet):
for cfg in data.iterconfigurations():
if not args.quiet:
print('Checking %s, config %s...' % (dumpfile, cfg.name))
exp05(cfg)
exp42(cfg)

View File

@ -6,12 +6,12 @@ This is a Python module that helps you access Cppcheck dump data.
License: No restrictions, use this as you need.
"""
from xml.etree import ElementTree
import argparse
from fnmatch import fnmatch
import json
import sys
from xml.etree import ElementTree
from fnmatch import fnmatch
class Directive:
"""
@ -407,8 +407,6 @@ class Function:
self.argument = {}
self.argumentId = {}
for arg in element:
self.argumentId[int(arg.get('nr'))] = arg.get('variable')
def __repr__(self):
attrs = ["Id", "tokenDefId", "name", "type", "isVirtual",
@ -513,6 +511,60 @@ class Variable:
self.scope = IdMap[self.scopeId]
class Value:
    """
    Value class

    One value entry read from a dump-file element.

    Attributes:
        intvalue       integer value (converted with int() when present)
        tokvalue       token value (kept exactly as returned by element.get)
        floatvalue     float value (kept exactly as returned by element.get)
        containerSize  container size (read from the 'container-size' attribute)
        condition      condition where this Value comes from
                       (read from 'condition-line', converted with int())
        valueKind      'known' or 'possible' (None when neither is set)
        inconclusive   Is value inconclusive?
    """

    # Class-level defaults; every instance overrides them in __init__.
    intvalue = None
    tokvalue = None
    floatvalue = None
    containerSize = None
    condition = None
    valueKind = None
    inconclusive = False

    def isKnown(self):
        """Return True if this value is marked 'known', otherwise False."""
        # A plain comparison always yields a bool; the former
        # `valueKind and valueKind == ...` form leaked None to callers.
        return self.valueKind == 'known'

    def isPossible(self):
        """Return True if this value is marked 'possible', otherwise False."""
        return self.valueKind == 'possible'

    def __init__(self, element):
        """
        :param element: object exposing a dict-like get(name) for the
                        attributes of one value node
        """
        self.intvalue = element.get('intvalue')
        # Only convert when the attribute is present and non-empty.
        if self.intvalue:
            self.intvalue = int(self.intvalue)
        self.tokvalue = element.get('tokvalue')
        self.floatvalue = element.get('floatvalue')
        self.containerSize = element.get('container-size')
        self.condition = element.get('condition-line')
        if self.condition:
            self.condition = int(self.condition)
        # 'known' takes precedence over 'possible' when both are present.
        if element.get('known'):
            self.valueKind = 'known'
        elif element.get('possible'):
            self.valueKind = 'possible'
        if element.get('inconclusive'):
            self.inconclusive = True

    def __repr__(self):
        attrs = ["intvalue", "tokvalue", "floatvalue", "containerSize",
                 "condition", "valueKind", "inconclusive"]
        return "{}({})".format(
            "Value",
            ", ".join("{}={}".format(a, repr(getattr(self, a))) for a in attrs)
        )
class ValueFlow:
"""
ValueFlow::Value class
@ -528,64 +580,9 @@ class ValueFlow:
Id = None
values = None
class Value:
"""
Value class
Attributes:
intvalue integer value
tokvalue token value
floatvalue float value
containerSize container size
condition condition where this Value comes from
valueKind 'known' or 'possible'
inconclusive Is value inconclusive?
"""
intvalue = None
tokvalue = None
floatvalue = None
containerSize = None
condition = None
valueKind = None
inconclusive = False
def isKnown(self):
return self.valueKind and self.valueKind == 'known'
def isPossible(self):
return self.valueKind and self.valueKind == 'possible'
def __init__(self, element):
self.intvalue = element.get('intvalue')
if self.intvalue:
self.intvalue = int(self.intvalue)
self.tokvalue = element.get('tokvalue')
self.floatvalue = element.get('floatvalue')
self.containerSize = element.get('container-size')
self.condition = element.get('condition-line')
if self.condition:
self.condition = int(self.condition)
if element.get('known'):
self.valueKind = 'known'
elif element.get('possible'):
self.valueKind = 'possible'
if element.get('inconclusive'):
self.inconclusive = True
def __repr__(self):
attrs = ["intvalue", "tokvalue", "floatvalue", "containerSize",
"condition", "valueKind", "inconclusive"]
return "{}({})".format(
"Value",
", ".join(("{}={}".format(a, repr(getattr(self, a))) for a in attrs))
)
def __init__(self, element):
self.Id = element.get('id')
self.values = []
for value in element:
self.values.append(ValueFlow.Value(value))
def __repr__(self):
attrs = ["Id", "values"]
@ -649,6 +646,7 @@ class Configuration:
functions List of Function items
variables List of Variable items
valueflow List of ValueFlow values
standards List of Standards values
"""
name = ''
@ -658,54 +656,28 @@ class Configuration:
functions = []
variables = []
valueflow = []
standards = []
def __init__(self, confignode):
self.name = confignode.get('cfg')
def __init__(self, name):
self.name = name
self.directives = []
self.tokenlist = []
self.scopes = []
self.functions = []
self.variables = []
self.valueflow = []
arguments = []
self.standards = []
for element in confignode:
if element.tag == "standards":
self.standards = Standards(element)
if element.tag == 'directivelist':
for directive in element:
self.directives.append(Directive(directive))
if element.tag == 'tokenlist':
for token in element:
self.tokenlist.append(Token(token))
# set next/previous..
prev = None
for token in self.tokenlist:
token.previous = prev
if prev:
prev.next = token
prev = token
if element.tag == 'scopes':
for scope in element:
self.scopes.append(Scope(scope))
for functionList in scope:
if functionList.tag == 'functionList':
for function in functionList:
self.functions.append(Function(function))
if element.tag == 'variables':
for variable in element:
var = Variable(variable)
if var.nameTokenId:
self.variables.append(var)
else:
arguments.append(var)
if element.tag == 'valueflow':
for values in element:
self.valueflow.append(ValueFlow(values))
def set_tokens_links(self):
    """Set next/previous links between tokens.

    Walks self.tokenlist in order. Each token's `previous` is pointed at the
    token before it (None for the first token) and that earlier token's
    `next` is pointed back at it. The last token's `next` is not modified.
    """
    prev = None
    for token in self.tokenlist:
        token.previous = prev
        # Compare with None explicitly so the link is still made if a token
        # object happens to evaluate as falsy.
        if prev is not None:
            prev.next = token
        prev = token
def set_id_map(self, arguments):
IdMap = {None: None, '0': None, '00000000': None, '0000000000000000': None}
for token in self.tokenlist:
IdMap[token.Id] = token
@ -719,7 +691,6 @@ class Configuration:
IdMap[variable.Id] = variable
for values in self.valueflow:
IdMap[values.Id] = values.values
for token in self.tokenlist:
token.setId(IdMap)
for scope in self.scopes:
@ -731,12 +702,12 @@ class Configuration:
for variable in arguments:
variable.setId(IdMap)
def __repr__(self):
attrs = ["name"]
return "{}({})".format(
"Configuration",
", ".join(("{}={}".format(a, repr(getattr(self, a))) for a in attrs))
)
def setIdMap(self, functions_arguments):
    """Establish the relationships between objects held by this configuration.

    Runs two steps in order: set_tokens_links() first, then
    set_id_map(functions_arguments).

    :param functions_arguments: List of Variable objects which are function arguments
    """
    # Step 1: chain the tokens together.
    self.set_tokens_links()
    # Step 2: hand the function arguments on to set_id_map.
    self.set_id_map(functions_arguments)
class Platform:
@ -810,7 +781,9 @@ class CppcheckData:
Contains a list of Configuration instances
Attributes:
configurations List of Configurations
filename Path to Cppcheck dump file
rawTokens List of rawToken elements
suppressions List of Suppressions
To iterate through all configurations use such code:
@code
@ -842,42 +815,156 @@ class CppcheckData:
rawTokens = []
platform = None
configurations = []
suppressions = []
def __init__(self, filename):
self.configurations = []
"""
:param filename: Path to Cppcheck dump file
"""
self.filename = filename
data = ElementTree.parse(filename)
files = [] # source files for elements occurred in this configuration
platform_done = False
rawtokens_done = False
suppressions_done = False
for platformNode in data.getroot():
if platformNode.tag == 'platform':
self.platform = Platform(platformNode)
# Parse general configuration options from <dumps> node
# We intentionally don't clean node resources here because we
# want to serialize in memory only small part of the XML tree.
for event, node in ElementTree.iterparse(self.filename, events=('start', 'end')):
if platform_done and rawtokens_done and suppressions_done:
break
if node.tag == 'platform' and event == 'start':
self.platform = Platform(node)
platform_done = True
elif node.tag == 'rawtokens' and event == 'end':
for rawtokens_node in node:
if rawtokens_node.tag == 'file':
files.append(rawtokens_node.get('name'))
elif rawtokens_node.tag == 'tok':
tok = Token(rawtokens_node)
tok.file = files[int(rawtokens_node.get('fileIndex'))]
self.rawTokens.append(tok)
rawtokens_done = True
elif node.tag == 'suppressions' and event == 'end':
for suppressions_node in node:
self.suppressions.append(Suppression(suppressions_node))
suppressions_done = True
for rawTokensNode in data.getroot():
if rawTokensNode.tag != 'rawtokens':
# Set links between rawTokens.
for i in range(len(self.rawTokens)-1):
self.rawTokens[i+1].previous = self.rawTokens[i]
self.rawTokens[i].next = self.rawTokens[i+1]
@property
def configurations(self):
    """
    Return every available Configuration object as a list.

    The list is built anew from iterconfigurations() each time the
    property is read.
    """
    collected = []
    collected.extend(self.iterconfigurations())
    return collected
def iterconfigurations(self):
"""
Create and return iterator for the available Configuration objects.
The iterator loops over all Configurations in the dump file tree, in document order.
"""
cfg = None
cfg_arguments = [] # function arguments for Configuration node initialization
cfg_function = None
cfg_valueflow = None
# Scopes contain a <varlist> with all variables that occur in them. Some
# of them appear in the <variables> node of this configuration.
# Others are arguments of functions.
# They use the same <var> tag but don't carry any attributes. So we
# set a special state while iterating over a <varlist> node to prevent
# overriding the cfg.variables list with empty values.
iter_varlist = False
# Use iterable objects to traverse XML tree for dump files incrementally.
# Iterative approach is required to avoid large memory consumption.
# Calling .clear() is necessary to let the element be garbage collected.
for event, node in ElementTree.iterparse(self.filename, events=('start', 'end')):
# Serialize new configuration node
if node.tag == 'dump':
if event == 'start':
cfg = Configuration(node.get('cfg'))
continue
elif event == 'end':
cfg.setIdMap(cfg_arguments)
yield cfg
cfg = None
cfg_arguments = []
# Parse nested elements of configuration node
elif node.tag == "standards" and event == 'start':
continue
files = []
for node in rawTokensNode:
if node.tag == 'file':
files.append(node.get('name'))
elif node.tag == 'tok':
tok = Token(node)
tok.file = files[int(node.get('fileIndex'))]
self.rawTokens.append(tok)
for i in range(len(self.rawTokens) - 1):
self.rawTokens[i + 1].previous = self.rawTokens[i]
self.rawTokens[i].next = self.rawTokens[i + 1]
elif node.tag == "standards" and event == 'end':
cfg.standards = Standards(node)
for suppressionsNode in data.getroot():
if suppressionsNode.tag == "suppressions":
for suppression in suppressionsNode:
self.suppressions.append(Suppression(suppression))
# Parse directives list
elif node.tag == 'directive' and event == 'start':
cfg.directives.append(Directive(node))
# root is 'dumps' node, each config has its own 'dump' subnode.
for cfgnode in data.getroot():
if cfgnode.tag == 'dump':
self.configurations.append(Configuration(cfgnode))
# Parse tokens
elif node.tag == 'tokenlist' and event == 'start':
continue
elif node.tag == 'token' and event == 'start':
cfg.tokenlist.append(Token(node))
# Parse scopes
elif node.tag == 'scopes' and event == 'start':
continue
elif node.tag == 'scope' and event == 'start':
cfg.scopes.append(Scope(node))
elif node.tag == 'varlist':
if event == 'start':
iter_varlist = True
elif event == 'end':
iter_varlist = False
# Parse functions
elif node.tag == 'functionList' and event == 'start':
continue
elif node.tag == 'function':
if event == 'start':
cfg_function = Function(node)
continue
elif event == 'end':
cfg.functions.append(cfg_function)
cfg_function = None
# Parse function arguments
elif node.tag == 'arg' and event == 'start':
arg_nr = int(node.get('nr'))
arg_variable_id = node.get('variable')
cfg_function.argumentId[arg_nr] = arg_variable_id
# Parse variables
elif node.tag == 'var' and event == 'start':
var = Variable(node)
if var.nameTokenId:
cfg.variables.append(var)
elif not iter_varlist:
cfg_arguments.append(var)
# Parse valueflows (list of values)
elif node.tag == 'valueflow' and event == 'start':
continue
elif node.tag == 'values':
if event == 'start':
cfg_valueflow = ValueFlow(node)
continue
elif event == 'end':
cfg.valueflow.append(cfg_valueflow)
cfg_valueflow = None
# Parse values
elif node.tag == 'value' and event == 'start':
cfg_valueflow.values.append(Value(node))
# Remove links to the sibling nodes
node.clear()
def __repr__(self):
attrs = ["configurations", "platform"]

View File

@ -10,12 +10,11 @@ for arg in sys.argv[1:]:
if arg.startswith('-'):
continue
print('Checking ' + arg + '...')
data = cppcheckdata.parsedump(arg)
print('Checking %s...' % arg)
data = cppcheckdata.CppcheckData(arg)
for cfg in data.configurations:
if len(data.configurations) > 1:
print('Checking ' + arg + ', config "' + cfg.name + '"...')
for cfg in data.iterconfigurations():
print('Checking %s, config %s...' % (arg, cfg.name))
for token in cfg.tokenlist:
if token.str != '(' or not token.astOperand1 or token.astOperand2:
continue

View File

@ -49,10 +49,10 @@ def isStringLiteral(tokenString):
return tokenString.startswith('"')
# check data
def stringConcatInArrayInit(configurations, rawTokens):
def stringConcatInArrayInit(data):
# Get all string macros
stringMacros = []
for cfg in configurations:
for cfg in data.iterconfigurations():
for directive in cfg.directives:
res = re.match(r'#define[ ]+([A-Za-z0-9_]+)[ ]+".*', directive.str)
if res:
@ -62,12 +62,12 @@ def stringConcatInArrayInit(configurations, rawTokens):
# Check code
arrayInit = False
for i in range(len(rawTokens)):
for i in range(len(data.rawTokens)):
if i < 2:
continue
tok1 = rawTokens[i-2].str
tok2 = rawTokens[i-1].str
tok3 = rawTokens[i-0].str
tok1 = data.rawTokens[i-2].str
tok2 = data.rawTokens[i-1].str
tok3 = data.rawTokens[i-0].str
if tok3 == '}':
arrayInit = False
elif tok1 == ']' and tok2 == '=' and tok3 == '{':
@ -76,11 +76,11 @@ def stringConcatInArrayInit(configurations, rawTokens):
isString2 = (isStringLiteral(tok2) or (tok2 in stringMacros))
isString3 = (isStringLiteral(tok3) or (tok3 in stringMacros))
if isString2 and isString3:
reportError(rawTokens[i], 'style', 'String concatenation in array initialization, missing comma?', 'stringConcatInArrayInit')
reportError(data.rawTokens[i], 'style', 'String concatenation in array initialization, missing comma?', 'stringConcatInArrayInit')
def implicitlyVirtual(data):
for cfg in data.configurations:
for cfg in data.iterconfigurations():
for function in cfg.functions:
if function.isImplicitlyVirtual is None:
continue
@ -89,7 +89,7 @@ def implicitlyVirtual(data):
reportError(function.tokenDef, 'style', 'Function \'' + function.name + '\' overrides base class function but is not marked with \'virtual\' keyword.', 'implicitlyVirtual')
def ellipsisStructArg(data):
for cfg in data.configurations:
for cfg in data.iterconfigurations():
for tok in cfg.tokenlist:
if tok.str != '(':
continue
@ -137,8 +137,9 @@ def ellipsisStructArg(data):
for arg in sys.argv[1:]:
if arg in ['-debug', '-verify', '--cli']:
continue
print('Checking ' + arg + '...')
data = cppcheckdata.parsedump(arg)
print("Checking %s..." % arg)
data = cppcheckdata.CppcheckData(arg)
if VERIFY:
VERIFY_ACTUAL = []
@ -149,7 +150,7 @@ for arg in sys.argv[1:]:
if word in ['stringConcatInArrayInit', 'implicitlyVirtual', 'ellipsisStructArg']:
VERIFY_EXPECTED.append(str(tok.linenr) + ':' + word)
stringConcatInArrayInit(data.configurations, data.rawTokens)
stringConcatInArrayInit(data)
implicitlyVirtual(data)
ellipsisStructArg(data)

View File

@ -2593,15 +2593,13 @@ class MisraChecker:
else:
self.printStatus('Checking ' + dumpfile + '...')
cfgNumber = 0
for cfg in data.configurations:
cfgNumber = cfgNumber + 1
if len(data.configurations) > 1:
self.printStatus('Checking ' + dumpfile + ', config "' + cfg.name + '"...')
for cfgNumber, cfg in enumerate(data.iterconfigurations()):
if not self.settings.quiet:
self.printStatus('Checking %s, config %s...' % (dumpfile, cfg.name))
self.executeCheck(207, self.misra_2_7, cfg)
if cfgNumber == 1:
# data.rawTokens is same for all configurations
if cfgNumber == 0:
self.executeCheck(301, self.misra_3_1, data.rawTokens)
self.executeCheck(302, self.misra_3_2, data.rawTokens)
self.executeCheck(401, self.misra_4_1, data.rawTokens)
@ -2612,12 +2610,12 @@ class MisraChecker:
self.executeCheck(505, self.misra_5_5, cfg)
# 6.1 require updates in Cppcheck (type info for bitfields are lost)
# 6.2 require updates in Cppcheck (type info for bitfields are lost)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(701, self.misra_7_1, data.rawTokens)
self.executeCheck(703, self.misra_7_3, data.rawTokens)
self.executeCheck(811, self.misra_8_11, cfg)
self.executeCheck(812, self.misra_8_12, cfg)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(814, self.misra_8_14, data.rawTokens)
self.executeCheck(905, self.misra_9_5, data.rawTokens)
self.executeCheck(1001, self.misra_10_1, cfg)
@ -2631,7 +2629,7 @@ class MisraChecker:
self.executeCheck(1107, self.misra_11_7, cfg)
self.executeCheck(1108, self.misra_11_8, cfg)
self.executeCheck(1109, self.misra_11_9, cfg)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(1201, self.misra_12_1_sizeof, data.rawTokens)
self.executeCheck(1201, self.misra_12_1, cfg)
self.executeCheck(1202, self.misra_12_2, cfg)
@ -2649,11 +2647,11 @@ class MisraChecker:
self.executeCheck(1502, self.misra_15_2, cfg)
self.executeCheck(1503, self.misra_15_3, cfg)
self.executeCheck(1505, self.misra_15_5, cfg)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(1506, self.misra_15_6, data.rawTokens)
self.executeCheck(1507, self.misra_15_7, cfg)
self.executeCheck(1602, self.misra_16_2, cfg)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(1603, self.misra_16_3, data.rawTokens)
self.executeCheck(1604, self.misra_16_4, cfg)
self.executeCheck(1605, self.misra_16_5, cfg)
@ -2661,7 +2659,7 @@ class MisraChecker:
self.executeCheck(1607, self.misra_16_7, cfg)
self.executeCheck(1701, self.misra_17_1, cfg)
self.executeCheck(1702, self.misra_17_2, cfg)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(1706, self.misra_17_6, data.rawTokens)
self.executeCheck(1707, self.misra_17_7, cfg)
self.executeCheck(1708, self.misra_17_8, cfg)
@ -2672,7 +2670,7 @@ class MisraChecker:
self.executeCheck(1902, self.misra_19_2, cfg)
self.executeCheck(2001, self.misra_20_1, cfg)
self.executeCheck(2002, self.misra_20_2, cfg)
if cfgNumber == 1:
if cfgNumber == 0:
self.executeCheck(2003, self.misra_20_3, data.rawTokens)
self.executeCheck(2004, self.misra_20_4, cfg)
self.executeCheck(2005, self.misra_20_5, cfg)

View File

@ -47,10 +47,10 @@ for arg in sys.argv[1:]:
if not arg.endswith('.dump'):
continue
print('Checking ' + arg + '...')
data = cppcheckdata.parsedump(arg)
for cfg in data.configurations:
if len(data.configurations) > 1:
print('Checking ' + arg + ', config "' + cfg.name + '"...')
data = cppcheckdata.CppcheckData(arg)
for cfg in data.iterconfigurations():
print('Checking %s, config %s...' % (arg, cfg.name))
if RE_VARNAME:
for var in cfg.variables:
if var.access == 'Private':
@ -87,4 +87,3 @@ for arg in sys.argv[1:]:
if not res:
reportError(
scope.bodyStart, 'style', 'Function ' + scope.className + ' violates naming convention', 'functionName')

View File

@ -92,7 +92,7 @@ def process(dumpfiles, configfile, debugprint=False):
if not afile[-5:] == '.dump':
continue
print('Checking ' + afile + '...')
data = cppcheckdata.parsedump(afile)
data = cppcheckdata.CppcheckData(afile)
# Check File naming
if "RE_FILE" in conf and conf["RE_FILE"]:
@ -111,8 +111,7 @@ def process(dumpfiles, configfile, debugprint=False):
evalExpr(conf["RE_NAMESPACE"], exp, mockToken, msgType, errors)
for cfg in data.configurations:
if len(data.configurations) > 1:
print('Checking ' + afile + ', config "' + cfg.name + '"...')
print('Checking %s, config %s...' % (afile, cfg.name))
if "RE_VARNAME" in conf and conf["RE_VARNAME"]:
for var in cfg.variables:
if var.nameToken and var.access != 'Global' and var.access != 'Public' and var.access != 'Private':

View File

@ -27,9 +27,10 @@ def checkstatic(data):
for arg in sys.argv[1:]:
if arg.startswith('-'):
continue
print('Checking ' + arg + '...')
data = cppcheckdata.parsedump(arg)
for cfg in data.configurations:
if len(data.configurations) > 1:
print('Checking ' + arg + ', config "' + cfg.name + '"...')
print('Checking %s...' % arg)
data = cppcheckdata.CppcheckData(arg)
for cfg in data.iterconfigurations():
print('Checking %s, config %s...' % (arg, cfg.name))
checkstatic(cfg)

View File

@ -154,7 +154,7 @@ def check_y2038_safe(dumpfile, quiet=False):
# Assume that the code is Y2038 safe until proven otherwise
y2038safe = True
# load XML from .dump file
data = cppcheckdata.parsedump(dumpfile)
data = cppcheckdata.CppcheckData(dumpfile)
# Convert dump file path to source file in format generated by cppcheck.
# For example after the following call:
@ -165,9 +165,9 @@ def check_y2038_safe(dumpfile, quiet=False):
srcfile = os.path.normpath(srcfile)
# go through each configuration
for cfg in data.configurations:
for cfg in data.iterconfigurations():
if not quiet:
print('Checking ' + srcfile + ', config "' + cfg.name + '"...')
print('Checking %s, config %s...' % (srcfile, cfg.name))
safe_ranges = []
safe = -1
time_bits_defined = False