#!/usr/bin/python3

# This file is part of the SV-Benchmarks collection of verification tasks:
# https://github.com/sosy-lab/sv-benchmarks
#
# SPDX-FileCopyrightText: 2011-2020 The SV-Benchmarks Community
#
# SPDX-License-Identifier: Apache-2.0

# This script iterates over all tasks and performs the following check:
# For each file with existing ".c"- and ".i"-file
# we generate the goto-cc program (i.e., intermediate representation of cbmc)
# and compare the goto-cc programs with goto-diff.
# Finding a difference implies a wrong preprocessing.

import argparse
import fnmatch
import glob
import yaml
import os
import shutil
import subprocess
import tempfile

# tasks to ignore, with reason why
TASKS_TO_IGNORE = {
  "floats-esbmc-regression/trunc_nondet_2.i": "(platform-dependent types)",
  "*pthread*/*": "(platform-dependent types)",
  "goblint-regression/*": "(platform-dependent types)",
}

# categories to be excluded ... (with reason and debug information)
CATEGORIES_TO_IGNORE = {
  "ConcurrencySafety-Main": "(platform-dependent types)",
  "ConcurrencySafety-NoOverflows": "(platform-dependent types)",
  "ConcurrencySafety-MemSafety": "(platform-dependent types)",
  "NoDataRace-Main": "(platform-dependent types)",
  "SoftwareSystems-OpenBSD-MemSafety": "(only custom includes, no system headers, complicated build process)",
  "SoftwareSystems-SQLite-MemSafety": "(complicated build process, requires patched version of cilly)",
}

# categories to be excluded, if option "skip-large" is used ... (with reason and debug information)
LARGE_CATEGORIES = {
  "SoftwareSystems-DeviceDriversLinux64-ReachSafety": "(only custom includes, no system headers, checking takes too much time)",
  "SoftwareSystems-DeviceDriversLinux64-MemSafety": "(only custom includes, no system headers, checking takes too much time)",
  "SoftwareSystems-DeviceDriversLinux64Large-ReachSafety": "(only custom includes, no system headers, checking takes too much time)",
}

# no original source available, there are only preprocessed files.
TASKS_ONLY_PREPROCESSED = [
  'ddv-machzwd/',
  'aws-c-common/',
  # for LDV: there is a related .cil.c file, but it doesn't necessarily match at all
  'ldv-linux-3.0/',
  'ldv-regression/',
  'ldv-linux-3.14-races/linux-3.14--drivers--media--platform--marvell-ccic--cafe_ccic.ko.cil-1.i',
  'loops/s3.i', # this single file is special
  'Juliet_Test/', # for Juliet there are c files, but each was preprocessed into two tasks, one valid and one invalid
  'combinations/'
]

CBMC_GIT_PATH = "../cbmc.git/"

DATA_MODELS = {"ILP32": "32", "LP64": "64"}


def fail(*msg):
  print("CRITICAL ERROR:", *msg)
  exit(1)


def expand(s):
  return sorted(glob.glob(s))


def get_tasks_from_set(setFile):
  with open(setFile, "r") as fp:
    for line in fp:
      if not line.startswith("#"): # ignore comments
        for task in expand(line.strip()):
          yield task


def build_goto_cc():
  """ build goto-cc and goto-diff if not available, and then set PATH to find it """
  if not shutil.which("goto-cc") or not shutil.which("goto-diff"):
    if not os.path.exists(CBMC_GIT_PATH + "src/goto-cc/goto-cc"):
      subprocess.check_call(["git", "clone", "--depth=1", "http://github.com/diffblue/cbmc.git", CBMC_GIT_PATH])
      subprocess.check_call(["make", "-j2", "minisat2-download"], cwd=CBMC_GIT_PATH + "src")
      subprocess.check_call(["make", "-j2", "CXX=g++-5", "goto-diff.dir", "goto-cc.dir"], cwd=CBMC_GIT_PATH + "src")
    cwd = os.getcwd()
    os.environ['PATH'] = ":".join([
        cwd + "/" + CBMC_GIT_PATH + "src/goto-cc",
        cwd + "/" + CBMC_GIT_PATH + "src/goto-diff",
        os.environ['PATH']
    ])


def execute_goto_cc(args, bits, orig, taskfile):
  """ convert both preprocessed and non-preprocessed files into goto-cc intermediate language and compare them """
  with tempfile.NamedTemporaryFile(prefix="compare_orig_", suffix=".out") as origoutfile, \
       tempfile.NamedTemporaryFile(prefix="compare_task_", suffix=".out") as taskoutfile:
    origout = origoutfile.name
    taskout = taskoutfile.name
    subprocess.check_call(["goto-cc", "-m" + bits, orig, "-o", origout])
    subprocess.check_call(["goto-cc", "-m" + bits, taskfile, "-o", taskout])
    stdout, stderr = subprocess.Popen(["goto-diff", "--verbosity", "2", "-u", origout, taskout],
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    if len(stderr) > 0:
      print(stderr.decode('utf-8').strip())
    if len(stdout) > 0:
      if args.SHOW_DIFF:
        subprocess.call(["goto-diff", "-u", origout, taskout])
      reason_to_ignore = get_reason_if_ignored(taskfile)
      if reason_to_ignore:
        print("WARNING: Difference on", taskfile, "detected (ignored)", reason_to_ignore)
      else:
        print("ERROR: Difference on", taskfile, "detected")
        if args.KEEP_GOING:
          global EC
          EC = 1
        else:
          exit(1)


def get_setfiles(args):
  for setfileWildcard in args.setfiles:
    setfiles = expand(setfileWildcard)
    if setfiles:
      for setfile in setfiles:
        yield setfile
    else:
      fail("Could not find a matching set file for", setfileWildcard)


def get_inputfiles_from_yml(yml, taskfile):
  # check whether the input_basename exists (nobody should ever use "null" as a filename!)
  inputFiles = yml['input_files']
  if not inputFiles:
    fail("No input files defined in", taskfile)
  elif isinstance(inputFiles, list):
      return inputFiles
  else:
      return [inputFiles] # always wrap as list


def get_bits_from_yml(yml, taskfile):
  data_model = yml.get("options", {}).get("data_model")
  if not data_model:
    fail("No data model defined in", taskfile)
  if not data_model in DATA_MODELS:
    fail("Unknown data model", data_model, "defined in", taskfile)
  return DATA_MODELS[data_model]


def get_orig_filename(taskfile):
  """ try to find a matching source file (.c) for a preprocessed task """
  if taskfile.endswith('.c.i'):
    orig = taskfile[:-2] # remove ending ".i"
  if taskfile.startswith('ldv-memsafety/memleaks'):
    taskfile = taskfile.replace('memleaks', 'memleaks-notpreprocessed/memleaks')
  orig = taskfile[:-2] + ".c" # replace .i with .c (or .c with .c)

  if os.path.exists(orig):
      return orig
  else:
    fail("No original source of", taskfile, "found at", orig)


def get_reason_if_ignored(taskfile):
  for pattern, reason in TASKS_TO_IGNORE.items():
    if fnmatch.fnmatch(taskfile, pattern):
      return reason
  return None


# parse comand line options and set default values
parser = argparse.ArgumentParser()
parser.add_argument("-k", "--keep-going", dest="KEEP_GOING", action="store_true",
                    help="do not exit after error")
parser.add_argument("-v", "--diff", dest="SHOW_DIFF", action="store_true",
                    help="show the changes for the preprocessed files")
parser.add_argument("--skip-large", dest="SKIP_LARGE", action="store_true",
                    help="ignore large benchmark sets (see internal list of large categories)")
parser.add_argument(dest="setfiles", type=str, nargs='*', default=["*.set"],
                    help='set files to be analysed')
parser.add_argument("-d", "--directory", dest="directories", type=str, action='append', default=None,
                    help='directories with files to be analysed')
args = parser.parse_args()

build_goto_cc()

EC=0

if args.directories:
  print("Analyzing only files from the following directories:\n    " + "\n    ".join(args.directories))
  for directory in args.directories:
    if not os.path.isdir(directory):
      print("Directory '{}' does not exist.".format(directory))

for setfile in get_setfiles(args):
  setname = os.path.basename(setfile)[:-4] # remove ending ".set"

  # skip some sets, like LDV (too big) or Concurrency (pthread headers are very platform dependent)
  if setname in CATEGORIES_TO_IGNORE:
    print("Skipping category", setname, CATEGORIES_TO_IGNORE[setname])
    continue
  if args.SKIP_LARGE and setname in LARGE_CATEGORIES:
    print("Skipping category", setname, LARGE_CATEGORIES[setname])
    continue

  print("Processing category", setname)

  i = 0
  for taskfile in get_tasks_from_set(setfile):
    if args.directories and os.path.dirname(taskfile) not in args.directories:
      continue

    if taskfile.endswith(".yml"):
      with open(taskfile, 'r') as yamlfile:
        yml = yaml.safe_load(yamlfile)
        inputFiles = get_inputfiles_from_yml(yml, taskfile)
        bits = get_bits_from_yml(yml, taskfile)
      if len(inputFiles) == 1:
        taskfile = os.path.join(os.path.dirname(taskfile), inputFiles[0])
      else:
        print("ignoring task consisting of multiple sourcefiles", inputFiles)
        continue

    if not os.path.exists(taskfile):
      fail("No task file", taskfile, "found")

    # no original source available
    if taskfile.startswith(tuple(TASKS_ONLY_PREPROCESSED)):
      continue

    # try to find a matching source file for a preprocessed task
    orig = get_orig_filename(taskfile)
    if orig == taskfile:
      continue # taskfile was already a .c-file

    i += 1
    if i % 10 == 0:
      print("Processing file", i, "of category", setname)

    # now we have found all required files and start with the actual check:
    execute_goto_cc(args, bits, orig, taskfile)

exit(EC)