    @param fail_threshold      Similarity percentage above which the run is
                               considered failed (threshold exceeded)
    @param ignore_directories  Directories whose files are excluded from the
                               comparison
    @param project_root_dir    Root directory used to shorten the reported
                               file paths
    @param file_extensions     File extensions of the source files to compare
    @param ignore_threshold    Similarity percentage below which matches are
                               not reported
    """
    source_code_files = list()
    files_to_ignore = list()
    if directories:
        for directory in directories:
            if not os.path.isdir(directory):
                print("Path does not exist or is not a directory:", directory)
                return (ReturnCode.BAD_INPUT, {})
            # NOTE: the helper gathering the source files of a directory was
            # lost in this fragment; the call name below is assumed.
            source_code_files += get_source_code_files(
                directory, file_extensions)
        for directory in ignore_directories:
            files_to_ignore += get_source_code_files(
                directory, file_extensions)
    if files:
        if len(files) < 2:
            print("Too few files to compare, you need to supply at least 2")
            return (ReturnCode.BAD_INPUT, {})
        for supplied_file in files:
            if not os.path.isfile(supplied_file):
                print("Supplied file does not exist:", supplied_file)
                return (ReturnCode.BAD_INPUT, {})
        source_code_files = files
    files_to_ignore += ignore_files if ignore_files else list()
    files_to_ignore = [os.path.normpath(f) for f in files_to_ignore]
    source_code_files = [os.path.normpath(f) for f in source_code_files]
    source_code_files = list(set(source_code_files) - set(files_to_ignore))
    if len(source_code_files) < 2:
        print("Not enough source code files found")
        return (ReturnCode.BAD_INPUT, {})

    source_code_files.sort()
    source_code_files = [os.path.abspath(f) for f in source_code_files]
    if not os.path.isdir(project_root_dir):
        print(
            "The project root directory does not exist or is not a directory:",
            project_root_dir,
        )
        return (ReturnCode.BAD_INPUT, {})
    project_root_dir = os.path.abspath(project_root_dir)
    # Append a trailing path separator so the root can be stripped cleanly
    # from the absolute file paths later on.
    project_root_dir = os.path.join(project_root_dir, "")
    # Length of the longest root-relative file path, used to align the
    # console report columns.
    largest_string_length = len(
        max(source_code_files, key=len).replace(project_root_dir, "")
    )
    source_code = OrderedDict()
    for source_code_file in source_code_files:
        try:
            with open(source_code_file, "r", errors="surrogateescape") as f:
                content = f.read()
                if only_code and source_code_file.endswith("py"):
                    # The original strips comments and docstrings here so that
                    # only executable code is compared; the helper name below
                    # is assumed.
                    content = strip_comments_and_docstrings(content)
                source_code[source_code_file] = content
        except Exception as err:
            print(f"ERROR: Failed to open file {source_code_file}, reason: {str(err)}")
    gen_docs = [
        [word.lower() for word in word_tokenize(source_code[source_file])]
        for source_file in source_code
    ]
    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    sims = gensim.similarities.Similarity(
        tempfile.gettempdir() + os.sep, tf_idf[corpus], num_features=len(dictionary)
    )
    column_label = file_column_label
    # "show_loc" is the assumed name of the flag that adds lines-of-code
    # information to the report.
    if show_loc:
        column_label += file_loc_label
        largest_string_length += len(file_loc_label)
    exit_code = ReturnCode.SUCCESS
    code_similarity = dict()
    for source_file in source_code:
        query_doc = [w.lower() for w in word_tokenize(source_code[source_file])]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        source_file_loc = get_loc_count(source_file)
        short_source_file_path = source_file.replace(project_root_dir, "")
        # Console output is suppressed in JSON mode; the original routes these
        # messages through a print helper whose name was lost, so guarded
        # prints are used as a stand-in.
        if not json_output:
            print("Code duplication probability for " + short_source_file_path)
            print("-" * (largest_string_length + similarity_label_length))
            print(
                "%s %s"
                % (column_label.center(largest_string_length), similarity_column_label)
            )
            print("-" * (largest_string_length + similarity_label_length))
        code_similarity[short_source_file_path] = dict()
        if show_loc:
            code_similarity[short_source_file_path][loc_label] = source_file_loc
        empty_length = len(code_similarity[short_source_file_path])
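        # Score the current file against every other indexed file. Pairs below
        # ignore_threshold are left out of the report; any pair above
        # fail_threshold marks the whole run as a failure.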
        for similarity, source in zip(sims[query_doc_tf_idf], source_code):
            # sims[...] yields one score per indexed document, in the same
            # order as source_code, so zip pairs each score with its file.
            if source == source_file:
                # Skip comparing a file with itself.
                continue
            similarity_percentage = similarity * 100
            if similarity_percentage < ignore_threshold:
                continue
            short_source_path = source.replace(project_root_dir, "")
            if show_loc:
                code_similarity[short_source_file_path][short_source_path] = dict()
                code_similarity[short_source_file_path][short_source_path][loc_label] = get_loc_count(
                    source
                )
                code_similarity[short_source_file_path][short_source_path][similarity_label] = round(
                    similarity_percentage, 2
                )
            else:
                code_similarity[short_source_file_path][short_source_path] = round(
                    similarity_percentage, 2
                )
            if similarity_percentage > fail_threshold:
                similarity_percentage = 100.00
                exit_code = ReturnCode.THRESHOLD_EXCEEDED
            # Pick a colour for the console line (the variable name, the
            # lowest-bracket constant and the reset constant are assumed).
            text_color = (
                CliColors.OKGREEN
                if similarity_percentage < 10
                else CliColors.WARNING
                if similarity_percentage < 20
                else CliColors.FAIL
            )
            info_to_print = short_source_path
            if not json_output:
                print(
                    "%s " % (info_to_print.ljust(largest_string_length))
                    + text_color
                    + "%.2f" % (similarity_percentage)
                    + CliColors.ENDC
                )
        # Drop files for which no counterpart above the ignore threshold was
        # found, so they do not clutter the report.
        if len(code_similarity[short_source_file_path]) == empty_length:
            del code_similarity[short_source_file_path]

    if exit_code == ReturnCode.THRESHOLD_EXCEEDED:
        if not json_output:
            print("Code duplication threshold exceeded. Please consult logs.")
    if json_output:
        similarities_json = json.dumps(code_similarity, indent=4)
        print(similarities_json)
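    # Optionally export the results as CSV: with LoC reporting enabled every
    # row is "File A, #LoC A, File B, #LoC B, Similarity", otherwise just
    # "File A, File B, Similarity".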
    if csv_output:
        with open(csv_output, "w") as csv_file:
            writer = csv.writer(csv_file)
            if show_loc:
                writer.writerow(["File A", "#LoC A", "File B", "#LoC B", "Similarity"])
                for first_file in code_similarity:
                    for second_file in code_similarity[first_file]:
                        if second_file != loc_label:
                            writer.writerow(
                                [
                                    first_file,
                                    code_similarity[first_file][loc_label],
                                    second_file,
                                    code_similarity[first_file][second_file][loc_label],
                                    code_similarity[first_file][second_file][similarity_label],
                                ]
                            )
            else:
                writer.writerow(["File A", "File B", "Similarity"])
                for first_file in code_similarity:
                    for second_file in code_similarity[first_file]:
                        writer.writerow(
                            [
                                first_file,
                                second_file,
                                code_similarity[first_file][second_file],
                            ]
                        )

    return (exit_code, code_similarity)