Support unified diff patch files, skip dot-dirs

git-svn-id: svn+ssh://svn.code.sf.net/p/flawfinder/code/trunk@2 5c01084b-1f27-0410-9f85-80411afe95dc
2007-01-16 02:53:03 +00:00 · 2007-01-16 02:53:03 +00:00 · f5e94b32ec
parent 14c90f7335
commit f5e94b32ec
7 changed files with 236 additions and 40 deletions
--- a/17
+++ b/17
@ -1,3 +1,20 @@
+2007-01-15 David A. Wheeler <dwheeler, at, dwheeler.com>
+	* Modified Sebastien Tandel's code so that it also supports GNU diff
+	  (his code worked only for svn diff)
+	* When using a patchfile, skip analysis of any file not
+	  listed in the patchfile.
+
+2007-01-15 Sebastien Tandel <sebastien, at, tandel (doht) be>
+	* Add support for using "svn diff" created patch files, based
+	  on the approach described by David A. Wheeler on how it
+	  could be done.
+
+2007-01-15 David A. Wheeler <dwheeler, at, dwheeler.com>
+	* By default, now skips directories beginning with "."
+	  (this makes it work nicely with many SCM systems).
+	  Added "--followdotdir" option if you WANT it to enter
+	  such directories.
+
 2004-06-15 David A. Wheeler <dwheeler, at, dwheeler.com>
 	* Released version 1.26.
 	* NOTE: Due to an error on my part,
--- a/249
+++ b/249
@ -6,7 +6,7 @@

 See the man page for a description of the options."""

-version="1.26"
+version="1.27"

 # The default output is as follows:
 # filename:line_number [risk_level] (type) function_name: message
@ -18,8 +18,8 @@ version="1.26"
 # Note: this code is designed to run under both Python 1.5 and 2.
 # Thus, it avoids constructs not in Python 1.5 such as "+="
 # and "print >> stderr".
-
-# Copyright (C) 2001-2004 David A. Wheeler
+#
+# Copyright (C) 2001-2007 David A. Wheeler
 # This is released under the General Public License (GPL):
 #
 #    This program is free software; you can redistribute it and/or modify
@ -52,10 +52,14 @@ show_immediately = 0
 show_inputs = 0          # Only show inputs?
 falsepositive = 0        # Work to remove false positives?
 allowlink = 0            # Allow symbolic links?
+skipdotdir = 1           # If 1, don't recurse into dirs beginning with "."
+                         # Note: This doesn't affect the command line.
 num_links_skipped = 0    # Number of links skipped.
+num_dotdirs_skipped = 0  # Number of dotdirs skipped.
 show_columns = 0
 never_ignore = 0         # If true, NEVER ignore problems, even if directed.
 list_rules = 0           # If true, list the rules (helpful for debugging)
+patch_file = ""          # File containing (unified) diff output.
 loadhitlist = None
 savehitlist = None
 diffhitlist = None
@ -89,6 +93,138 @@ starttime = time.time()  # Used to determine analyzed lines/second.
 line_beginning = re.compile( r'(?m)^' )
 blank_line     = re.compile( r'(?m)^\s+$' )

+
+# The following code accepts unified diff format from both subversion (svn)
+# and GNU diff, which aren't well-documented.  It gets filenames from
+# "Index:" if it exists, else from the "+++ FILENAME ..." entry.
+# Note that this is different than some tools (which will use "+++" in
+# preference to "Index:"), but subversion's nonstandard format is easier
+# to handle this way.
+# Since they aren't well-documented, here's some info on the diff formats:
+# GNU diff format:
+#    --- OLDFILENAME OLDTIMESTAMP
+#    +++ NEWFILENAME NEWTIMESTAMP
+#    @@ -OLDSTART,OLDLENGTH +NEWSTART,NEWLENGTH @@
+#    ... Changes where preceding "+" is add, "-" is remove, " " is unchanged.
+#
+#    ",OLDLENGTH" and ",NEWLENGTH" are optional  (they default to 1).
+#    GNU unified diff format doesn't normally output "Index:"; you use
+#    the "+++/---" to find them (presuming the diff user hasn't used --label
+#    to mess it up).
+#
+# Subversion format:
+#    Index: FILENAME
+#    --- OLDFILENAME (comment)
+#    +++ NEWFILENAME (comment)
+#    @@ -OLDSTART,OLDLENGTH +NEWSTART,NEWLENGTH @@
+#
+#    In subversion, the "Index:" always occurs, and note that paren'ed
+#    comments are in the oldfilename/newfilename, NOT timestamps like
+#    everyone else.
+#
+# Single Unix Spec version 3 (http://www.unix.org/single_unix_specification/)
+# does not specify unified format at all; it only defines the older
+# (obsolete) context diff format.  That format DOES use "Index:", but
+# only when the filename isn't specified otherwise.
+# We're only supporting unified format directly; if you have an older diff
+# format, use "patch" to apply it, and then use "diff -u" to create a
+# unified format.
+# 
+diff_index_filename = re.compile( r'^Index:\s+(?P<filename>.*)' )
+diff_newfile = re.compile( r'^\+\+\+\s(?P<filename>.*)$' )
+diff_hunk = re.compile( r'^@@ -\d+(,\d+)?\s+\+(?P<linenumber>\d+)[, ].*@@$' )
+diff_line_added = re.compile( r'^\+[^+].*' )
+diff_line_del = re.compile( r'^-[^-].*' )
+# The "+++" newfile entries have the filename, followed by a trailing
+# timestamp or " (comment)".
+# Timestamps can be of these forms:
+#   2005-04-24 14:21:39.000000000 -0400
+#   Mon Mar 10 15:13:12 1997
+# Also, "newfile" can have " (comment)" appended.  Find and eliminate this.
+# Note that the expression below is Y10K (and Y100K) ready. :-).
+diff_findjunk = re.compile( r'^(?P<filename>.*)((\s\d\d\d\d+-\d\d-\d\d\s+\d\d:\d[0-9:.]+Z?(\s+[\-\+0-9A-Z]+)?)|(\s[A-Za-z][a-z]+\s[A-za-z][a-z]+\s\d+\s\d+:\d[0-9:.]+Z?(\s[\-\+0-9]*)?\s\d\d\d\d+)|(\s\(.*\)))\s*$')
+
+# For each file named in the patch file patch_file, record the line numbers
+# (in the new file, i.e., after the patch is applied) that were added.
+# We keep this information in a hash table for quick access later.
+#
+def load_patch_info(patch_file):
+  patch={}
+  line_counter= 0
+  initial_number= 0
+  index_statement = False # Set true if we see "Index:".
+  try: hPatch = open(patch_file, 'r')
+  except:
+    print "Error: failed to open", h(patch_file)
+    sys.exit(1)
+
+  patched_filename = "" # Name of new file patched by current hunk.
+
+  while True: # Loop-and-half construct.  Read a line, end loop when no more
+    sLine = hPatch.readline()
+    if (sLine == ''): break  # Done reading.
+
+    # This is really a sequence of if ... elsif ... elsif..., but
+    # because Python forbids '=' in conditions, we do it this way.
+    index_filename_match = diff_index_filename.match(sLine)
+    if (index_filename_match):
+      patched_filename = string.strip(index_filename_match.group('filename'))
+      index_statement = True
+      # Should never happen (like below):
+      if (patch.has_key(patched_filename) == True):
+        error("filename occurs more than once in the patch: %s" %
+               patched_filename)
+      else:
+        patch[patched_filename] = {}
+
+    else:
+      newfile_match = diff_newfile.match(sLine)
+      # We'll ignore the match if an "Index:" line has already been seen.
+      # This makes "Index:" take precedence over "+++". We do this because
+      # "Index:" doesn't have junk after it that might be mistaken for part
+      # of the filename.
+      if ( (not index_statement) and newfile_match):
+        patched_filename = string.strip(newfile_match.group('filename'))
+        # Clean up filename - remove trailing timestamp and/or (comment).
+        findjunk_match = diff_findjunk.match(patched_filename)
+        if (findjunk_match):
+          patched_filename = string.strip(findjunk_match.group('filename'))
+        # Now we have the filename! Check if we've already seen it
+        # (we should not have), just like above:
+        if (patch.has_key(patched_filename)):
+  	  error("filename occurs more than once in the patch: %s" %
+                patched_filename)
+        else:
+          patch[patched_filename] = {}
+
+      else:
+        hunk_match = diff_hunk.match(sLine)
+        if (hunk_match):
+          if (patched_filename == ""):
+              error("wrong type of patch file : we have a line number without having seen a filename")
+          initial_number= hunk_match.group('linenumber')
+          line_counter= 0
+
+        else:
+          line_added_match = diff_line_added.match(sLine)
+          if (line_added_match):
+            line_added = line_counter + int(initial_number)
+            patch[patched_filename][line_added] = True
+            # Let's also warn about the lines above and below this one,
+            # so that errors that "leak" into adjacent lines are caught.
+            # Besides, if you're creating a patch, you had to at least look
+            # at adjacent lines, so you're in a position to fix them.
+            patch[patched_filename][line_added - 1] = True
+            patch[patched_filename][line_added + 1] = True
+            line_counter += 1
+
+          else:
+            line_del_match = diff_line_del.match(sLine)
+            if (line_del_match == None):
+              line_counter += 1
+  return patch
+
+
 def htmlize(s):
  # Take s, and return legal (UTF-8) HTML.
  s1 = string.replace(s,"&","&amp;")
@ -1083,7 +1219,7 @@ p_directive = re.compile( r'(?i)\s*(ITS4|Flawfinder|RATS):\s*([^\*]*)' )

 max_lookahead=500  # Lookahead limit for c_static_array.

-def process_c_file(f):
+def process_c_file(f, patch_infos):
  global filename, linenumber, ignoreline, sumlines, num_links_skipped
  global sloc
  filename=f
@ -1095,6 +1231,16 @@ def process_c_file(f):
  linebegin = 1
  codeinline = 0 # 1 when we see some code (so increment sloc at newline)

+  if ((patch_infos != None) and (not patch_infos.has_key(f))):
+    # This file isn't in the patch list, so don't bother analyzing it.
+    if not quiet:
+      if output_format:
+        print "Skipping unpatched file ", h(f), "<br>"
+      else:
+        print "Skipping unpatched file", f
+      sys.stdout.flush()
+    return
+
  if f == "-":
   input = sys.stdin
  else:
@ -1189,23 +1335,24 @@ def process_c_file(f):
            word = text[startpos:endpos]
            # print "Word is:", text[startpos:endpos]
            if c_ruleset.has_key(word) and c_valid_match(text, endpos):
-              # FOUND A MATCH, setup & call hook.
-              # print "HIT: #%s#\n" % word
-              # Don't use the tuple assignment form, e.g., a,b=c,d
-              # because Python (least 2.2.2) does that slower
-              # (presumably because it creates & destroys temporary tuples)
-              hit = Hit(c_ruleset[word])
-              hit.name = word
-              hit.start = startpos
-              hit.end = endpos
-              hit.line = linenumber
-              hit.column = find_column(text, startpos)
-              hit.filename=filename
-              hit.context_text = get_context(text, startpos)
-              hit.parameters = extract_c_parameters(text, endpos)
-              if hit.extract_lookahead:
-                hit.lookahead = text[startpos:startpos+max_lookahead]
-              apply(hit.hook, (hit, ))
+	      if ( (patch_infos == None) or ((patch_infos != None) and patch_infos[f].has_key(linenumber))):
+		# FOUND A MATCH, setup & call hook.
+		# print "HIT: #%s#\n" % word
+		# Don't use the tuple assignment form, e.g., a,b=c,d
+		# because Python (least 2.2.2) does that slower
+		# (presumably because it creates & destroys temporary tuples)
+		hit = Hit(c_ruleset[word])
+		hit.name = word
+		hit.start = startpos
+		hit.end = endpos
+		hit.line = linenumber
+		hit.column = find_column(text, startpos)
+		hit.filename=filename
+		hit.context_text = get_context(text, startpos)
+		hit.parameters = extract_c_parameters(text, endpos)
+		if hit.extract_lookahead:
+		  hit.lookahead = text[startpos:startpos+max_lookahead]
+		apply(hit.hook, (hit, ))
          elif p_digits.match(c):
            while i<len(text) and p_digits.match(text[i]): # Process a number.
              i = i + 1
@ -1283,20 +1430,24 @@ c_extensions = { '.c' : 1, '.h' : 1,
               }


-def maybe_process_file(f):
+def maybe_process_file(f, patch_infos):
  # process f, but only if (1) it's a directory (so we recurse), or
  # (2) it's source code in a language we can handle.
  # Currently, for files that means only C/C++, and we check if the filename
  # has a known C/C++ filename extension.  If it doesn't, we ignore the file.
  # We accept symlinks only if allowlink is true.
-  global num_links_skipped
+  global num_links_skipped, num_dotdirs_skipped
  if os.path.isdir(f):
    if (not allowlink) and os.path.islink(f):
      if not quiet: print "Warning: skipping symbolic link directory", h(f)
      num_links_skipped = num_links_skipped + 1
      return
+    if (skipdotdir and ("." == os.path.basename(f)[0])):
+      if not quiet: print "Warning: skipping directory with initial dot", h(f)
+      num_dotdirs_skipped = num_dotdirs_skipped + 1
+      return
    for file in os.listdir(f):
-      maybe_process_file(os.path.join(f, file))
+      maybe_process_file(os.path.join(f, file), patch_infos)
  # Now we will FIRST check if the file appears to be a C/C++ file, and
  # THEN check if it's a regular file or symlink.  This is more complicated,
  # but I do it this way so that there won't be a lot of pointless
@ -1314,10 +1465,12 @@ def maybe_process_file(f):
        # device files, etc. won't cause trouble.
        if not quiet: print "Warning: skipping non-regular file", h(f)
      else:
-        process_c_file(f)
+	# We want to know the difference only with files found in the patch.
+	if ( (patch_infos == None) or (patch_infos != None and patch_infos.has_key(f) == True) ):
+	  process_c_file(f, patch_infos)


-def process_file_args(files):
+def process_file_args(files, patch_infos):
  # Process the list of "files", some of which may be directories,
  # which were given on the command line.
  # This is handled differently than anything not found on the command line
@ -1336,12 +1489,14 @@ def process_file_args(files):
    elif os.path.isfile(f) or f == "-":
       # If on the command line, FORCE processing of it.
       # Currently, we only process C/C++.
-       process_c_file(f)
+       # check if we only want to review a patch
+       if ( (patch_infos != None and patch_infos.has_key(f) == True) or (patch_infos == None) ):
+	process_c_file(f, patch_infos)
    elif os.path.isdir(f):
       # At one time flawfinder used os.path.walk, but that Python
       # built-in doesn't give us enough control over symbolic links.
       # So, we'll walk the filesystem hierarchy ourselves:
-       maybe_process_file(f)
+       maybe_process_file(f, patch_infos)
    else:
       if not quiet: print "Warning: skipping non-regular file", h(f)

@ -1360,6 +1515,8 @@ flawfinder [--help] [--context]  [-c]  [--columns | -C] [--html]

  --allowlink
              Allow symbolic links.
+  --followdotdir
+              Follow directories whose names begin with ".".

  --context
  -c          Show context (the line having the "hit"/potential flaw)
@ -1394,6 +1551,8 @@ flawfinder [--help] [--context]  [-c]  [--columns | -C] [--html]

  --omittime  Omit time to run.

+  --patch=F   display information related to the patch F. (patch must be already applied)
+
  --Q
  --quiet     Don't display status information (i.e., which files are being
              examined) while the analysis is going on.
@ -1419,17 +1578,19 @@ flawfinder [--help] [--context]  [-c]  [--columns | -C] [--html]
 """

 def process_options():
-  global show_context, show_inputs, allowlink, omit_time
+  global show_context, show_inputs, allowlink, skipdotdir, omit_time
  global output_format, minimum_level, show_immediately, single_line
  global falsepositive
  global show_columns, never_ignore, quiet, showheading, list_rules
  global loadhitlist, savehitlist, diffhitlist
+  global patch_file
  try:
    # Note - as a side-effect, this sets sys.argv[].
-    optlist, args = getopt.getopt(sys.argv[1:], "cm:nih?CSDQIF",
+    optlist, args = getopt.getopt(sys.argv[1:], "cm:nih?CSDQIFP:",
                    ["context", "minlevel=", "immediate", "inputs", "input",
                     "nolink", "falsepositive", "falsepositives",
-                     "columns", "listrules", "omittime", "allowlink",
+                     "columns", "listrules", "omittime", "allowlink", "patch=",
+                     "followdotdir",
                     "neverignore", "quiet", "dataonly", "html", "singleline",
                     "loadhitlist=", "savehitlist=", "diffhitlist=",
                     "version", "help" ])
@ -1453,6 +1614,8 @@ def process_options():
        omit_time = 1
      elif opt == "--allowlink":
        allowlink = 1
+      elif opt == "--followdotdir":
+        skipdotdir = 0
      elif opt == "--listrules":
        list_rules = 1
      elif opt == "--html":
@ -1466,6 +1629,14 @@ def process_options():
        show_immediately = 1
      elif opt == "-n" or opt == "--neverignore":
        never_ignore = 1
+      elif opt == "-P" or opt == "--patch":
+	# Note: This is -P, so that a future -p1 option can strip away
+	# pathname prefixes (with the same option name as "patch").
+	patch_file = value
+	# If we honored "ignore" comments, a patch could change a line that
+	# was previously ignored so that it now merits a valid warning, and
+	# we would never notice.  So, set never_ignore.
+	never_ignore = 1
      elif opt == "--loadhitlist":
        loadhitlist = value
        display_header()
@ -1509,11 +1680,14 @@ def process_files():
    f = open(loadhitlist)
    hitlist = pickle.load(f)
  else:
+    patch_infos = None
+    if (patch_file != ""):
+      patch_infos = load_patch_info(patch_file)
    files = sys.argv[1:]
    if not files:
        print "*** No input files"
        return None
-    process_file_args(files)
+    process_file_args(files, patch_infos)
    return 1


@ -1612,9 +1786,10 @@ def show_final_results():
      print "<br>"
    else:
      print
-    print "Hits/KSLOC@level+ =",
-    for i in range(0,6):
-      print "[%d+] %3g" % (i, count_per_level_and_up[i]*1000.0/sloc),
+    if (sloc > 0):
+      print "Hits/KSLOC@level+ =",
+      for i in range(0,6):
+        print "[%d+] %3g" % (i, count_per_level_and_up[i]*1000.0/sloc),
    if output_format:
      print "<br>"
    else:
@ -1624,6 +1799,10 @@ def show_final_results():
      print "Symlinks skipped =", num_links_skipped, "(--allowlink overrides but see doc for security issue)"
      if output_format:
        print "<br>"
+    if num_dotdirs_skipped:
+      print "Dot directories skipped =", num_dotdirs_skipped, "(--followdotdir overrides)"
+      if output_format:
+        print "<br>"
    if num_ignored_hits > 0:
      print "Suppressed hits =", num_ignored_hits, "(use --neverignore to show them)"
      if output_format:
--- a/flawfinder.spec
+++ b/flawfinder.spec
@ -1,6 +1,6 @@
 Name: flawfinder
 Summary: Examines C/C++ source code for security flaws
-Version: 1.26
+Version: 1.27
 Release: 1
 License: GPL
 Group: Development/Tools
--- a/2
+++ b/2
@ -9,7 +9,7 @@
 # Eventually switch to using DistUtils to autogenerate.

 NAME=flawfinder
-VERSION=1.26
+VERSION=1.27
 RPM_VERSION=1
 VERSIONEDNAME=$(NAME)-$(VERSION)
 ARCH=noarch
--- a/setup.py
+++ b/setup.py
@ -25,7 +25,7 @@ import commands

 setup (# Distribution meta-data
       name = "flawfinder",
-       version = "1.26",
+       version = "1.27",
       description = "a program that examines source code looking for security weaknesses",
       author = "David A. Wheeler",
       author_email = "dwheeler@dwheeler.com",
--- a/test-results.html
+++ b/test-results.html
@ -9,7 +9,7 @@
 <body>
 <h1>Flawfinder Results</h1>
 Here are the security scan results from
-<a href="http://www.dwheeler.com/flawfinder">Flawfinder version 1.25</a>,
+<a href="http://www.dwheeler.com/flawfinder">Flawfinder version 1.27</a>,
 (C) 2001-2004 <a href="http://www.dwheeler.com">David A. Wheeler</a>.
 Number of dangerous functions in C/C++ ruleset: 158
 <p>
--- a/test-results.txt
+++ b/test-results.txt
@ -1,4 +1,4 @@
-Flawfinder version 1.25, (C) 2001-2004 David A. Wheeler.
+Flawfinder version 1.27, (C) 2001-2004 David A. Wheeler.
 Number of dangerous functions in C/C++ ruleset: 158
 Examining test.c
 Examining test2.c