Arkanjo 0.1
A tool for finding duplicated functions in codebases
Loading...
Searching...
No Matches
duplicate_code_detection.py
Go to the documentation of this file.
1"""! @package duplicate_code_detection
2@file duplicate_code_detection.py
3@brief A simple Python3 tool to detect similarities between files within a repository.
4
5Document similarity code adapted from Jonathan Mugan's tutorial:
6https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python
7"""
8
9import os
10import sys
11import argparse
12import gensim
13import ast
14import csv
15import astor
16import re
17import tempfile
18import json
19from enum import Enum
20from nltk.tokenize import word_tokenize
21from collections import OrderedDict
22
# File extensions (written without the leading dot) that are scanned by
# default; also used as the default of the --file-extensions option.
source_code_file_extensions = ["h", "c", "cpp", "cc", "java", "py", "cs"]
# Header of the file-name column in the textual report.
file_column_label = "File"
# Suffix appended to the file column header when line counts are shown.
file_loc_label = ",#LoC"
# Header of the similarity column in the textual report.
similarity_column_label = "Similarity (%)"
# Width of the similarity header; used to size the report's separator lines.
similarity_label_length = len(similarity_column_label)
# Key under which a file's line count is stored in the results dictionary.
loc_label = "#LoC"
# Key under which a similarity value is stored when line counts are shown.
similarity_label = "Similarity"
30
class ReturnCode(Enum):
    """! @brief Outcome of a run, expressed as a process exit status.

    Each member's integer value is the status code associated with that
    outcome.
    """

    # The run finished without any problem being reported.
    SUCCESS = 0
    # The supplied parameters were invalid.
    BAD_INPUT = 1
    # A threshold limit was exceeded.
    THRESHOLD_EXCEEDED = 2
41
42
44 """! @brief ANSI color codes for terminal text formatting.
45
46 Provides named constants for colored terminal output using ANSI escape sequences.
47 All colors should be used with ENDC to reset formatting.
48
49 Example:
50 print(f"{CliColors.OKGREEN}Success!{CliColors.ENDC}")
51 """
52 HEADER = "\033[95m"
53 OKBLUE = "\033[94m"
54 OKGREEN = "\033[92m"
55 WARNING = "\033[93m"
56 FAIL = "\033[91m"
57 ENDC = "\033[0m"
58 BOLD = "\033[1m"
59 UNDERLINE = "\033[4m"
60
61
def get_all_source_code_from_directory(directory, file_extensions):
    """! @brief Recursively collect the source code files below a directory

    A file is selected when its extension, as reported by os.path.splitext
    and stripped of the leading dot, is one of the requested extensions.

    @param directory Root directory to walk
    @param file_extensions Accepted extensions, written without the leading dot
    @return source_code_files List with the path of every selected file
    """
    return [
        os.path.join(folder, entry)
        for folder, _, entries in os.walk(directory)
        for entry in entries
        if os.path.splitext(entry)[1][1:] in file_extensions
    ]
78
79
def conditional_print(text, machine_friendly_output):
    """! @brief Print a message unless machine-friendly output is requested

    @param text Message to write to standard output
    @param machine_friendly_output When truthy, the message is suppressed
    """
    if machine_friendly_output:
        return
    print(text)
88
89
def remove_comments_and_docstrings(source_code: str) -> str:
    """! @brief Strip comments and docstrings from Python source code

    The source is parsed into an AST, the leading docstring statement of the
    module and of every class and (async) function is dropped, and the tree
    is rendered back to text. Comments are not represented in the AST, so
    they do not survive the round trip.

    @see https://gist.github.com/phpdude/1ae6f19de213d66286c8183e9e3b9ec1

    @param source_code Raw source code as a single string
    @return source_code_clean Stripped source code as a single string
    @exception SyntaxError Propagated from ast.parse when the input is not valid Python
    """
    parsed = ast.parse(source_code)
    docstring_owners = (
        ast.FunctionDef,
        ast.ClassDef,
        ast.AsyncFunctionDef,
        ast.Module,
    )
    for node in ast.walk(parsed):
        if not isinstance(node, docstring_owners):
            continue

        # ast.get_docstring() reports a docstring exactly when the body starts
        # with a bare string expression. It is used instead of an isinstance
        # check against ast.Str, which is deprecated and no longer exists in
        # recent Python releases.
        if ast.get_docstring(node, clean=False) is None:
            continue

        node.body = node.body[1:]

    source_code_clean = astor.to_source(parsed)
    return source_code_clean
120
121
def get_loc_count(file_path):
    """! @brief Count the lines of a file, blank lines and comments included

    The count is best-effort: any failure is reported as a warning on
    standard output instead of being raised.

    @param file_path Path of the file to measure
    @return lines_count Number of lines in the file, or -1 if it could not be read
    """
    lines_count = -1
    try:
        # Decode leniently, like the analysis itself does when reading the
        # sources: a stray undecodable byte must not cost the file its count.
        with open(
            os.path.normpath(file_path), "r", errors="surrogateescape"
        ) as the_file:
            # Stream the file instead of materialising every line in memory.
            lines_count = sum(1 for _ in the_file)
    except Exception as err:
        print(f"WARNING: Failed to get lines count for file {file_path}, reason: {str(err)}")
    return lines_count
135
136
def get_loc_to_print(loc_count):
    """! @brief Render a line count for display

    @param loc_count Line count; a negative value means the count is unavailable
    @return loc_to_print The count as text, or an empty string for a negative count
    """
    if loc_count < 0:
        return ""
    return str(loc_count)
145
146
def _parse_bool_flag(value):
    """! @brief Interpret a command-line token as a boolean

    Used as an argparse converter in place of the builtin bool, which treats
    every non-empty string, "False" included, as True.

    @param value Token supplied on the command line
    @return False for an empty token or an explicit negative such as "false",
            "no", "off" or "0" (case-insensitive); True for anything else
    """
    return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


def main():
    """! @brief Parse the command line and run the duplicate code detection

    @return result The tuple returned by run(): the ReturnCode of the run and
            the dictionary with the collected similarities
    """
    parser_description = (
        CliColors.HEADER
        + CliColors.BOLD
        + "=== Duplicate Code Detection Tool ==="
        + CliColors.ENDC
    )
    parser = argparse.ArgumentParser(description=parser_description)
    parser.add_argument(
        "-t",
        "--fail-threshold",
        type=int,
        default=100,
        help="The maximum allowed similarity before the script exits with an error.",
    )
    # Exactly one of --directories / --files has to be given.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-d",
        "--directories",
        nargs="+",
        help="Check for similarities between all files of the specified directories.",
    )
    group.add_argument(
        "-f",
        "--files",
        nargs="+",
        help="Check for similarities between specified files. "
        "The more files are supplied the more accurate are the results.",
    )
    parser.add_argument(
        "--ignore-directories", nargs="+", default=list(), help="Directories to ignore."
    )
    parser.add_argument("--ignore-files", nargs="+", help="Files to ignore.")
    # The value stays optional-but-accepted so that existing invocations such
    # as "-j True" keep working, a bare "-j" enables JSON output, and
    # "-j False" really disables it.
    parser.add_argument(
        "-j",
        "--json",
        type=_parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        help="Print output as JSON.",
    )
    parser.add_argument(
        "--project-root-dir",
        type=str,
        default=str(),
        help="The relative path to the project root directory to be removed when printing out results.",
    )
    parser.add_argument(
        "--file-extensions",
        nargs="+",
        default=source_code_file_extensions,
        help="File extensions to check for similarities.",
    )
    parser.add_argument(
        "--ignore-threshold",
        type=int,
        default=0,
        help="Don't print out similarity below the ignore threshold",
    )
    parser.add_argument(
        "--only-code",
        action="store_true",
        help="Removes comments and docstrings from the source code before analysis",
    )
    parser.add_argument(
        "--csv-output",
        type=str,
        default=str(),
        help="Outputs results as a CSV to the specified CSV path",
    )
    parser.add_argument(
        "--show-loc",
        action="store_true",
        help="Add file line counts, including blank lines and comments, to all outputs.",
    )
    args = parser.parse_args()

    result = run(
        args.fail_threshold,
        args.directories,
        args.files,
        args.ignore_directories,
        args.ignore_files,
        args.json,
        args.project_root_dir,
        args.file_extensions,
        args.ignore_threshold,
        args.only_code,
        args.csv_output,
        args.show_loc,
    )

    return result
239
240
def run(
    fail_threshold,
    directories,
    files,
    ignore_directories,
    ignore_files,
    json_output,
    project_root_dir,
    file_extensions,
    ignore_threshold,
    only_code,
    csv_output,
    show_loc,
):
    """! @brief Measure the pairwise similarity of source files and report it

    The files are tokenised, turned into a TF-IDF model and every file is
    queried against all the others. Results are printed as a colored table
    or, when requested, as JSON, and can additionally be written to a CSV file.

    @param fail_threshold Similarity percentage above which the run is reported as THRESHOLD_EXCEEDED
    @param directories Directories whose source files are compared; takes precedence over files
    @param files Explicit list of files to compare, used when no directories are given
    @param ignore_directories Directories whose source files are excluded (only used together with directories)
    @param ignore_files Files to exclude, may be None
    @param json_output When truthy, suppress the table and print the results as JSON
    @param project_root_dir Directory prefix removed from the paths shown in the results, may be empty
    @param file_extensions Extensions (without dot) of the files picked up from directories
    @param ignore_threshold Similarity percentage below which a pair is left out of the results
    @param only_code When truthy, strip comments and docstrings from Python files before the analysis
    @param csv_output Path of a CSV file to write the results to, empty to skip
    @param show_loc When truthy, add the line count of each file to all outputs
    @return Tuple (exit_code, code_similarity): a ReturnCode and a dictionary
            mapping each file to the files it resembles. Without show_loc the
            inner values are percentages; with show_loc each file entry also
            holds its own "#LoC" and the inner values are dictionaries with
            "#LoC" and "Similarity".
    """
    # Determine which files to compare for similarities
    source_code_files = list()
    files_to_ignore = list()
    if directories:
        for directory in directories:
            if not os.path.isdir(directory):
                print("Path does not exist or is not a directory:", directory)
                return (ReturnCode.BAD_INPUT, {})
            source_code_files += get_all_source_code_from_directory(
                directory, file_extensions
            )
        for directory in ignore_directories:
            files_to_ignore += get_all_source_code_from_directory(
                directory, file_extensions
            )
    else:
        # "not files" also covers files=None when neither input is supplied.
        if not files or len(files) < 2:
            print("Too few files to compare, you need to supply at least 2")
            return (ReturnCode.BAD_INPUT, {})
        for supplied_file in files:
            if not os.path.isfile(supplied_file):
                print("Supplied file does not exist:", supplied_file)
                return (ReturnCode.BAD_INPUT, {})
        source_code_files = files

    files_to_ignore += ignore_files if ignore_files else list()
    files_to_ignore = [os.path.normpath(f) for f in files_to_ignore]
    source_code_files = [os.path.normpath(f) for f in source_code_files]
    source_code_files = list(set(source_code_files) - set(files_to_ignore))
    if len(source_code_files) < 2:
        print("Not enough source code files found")
        return (ReturnCode.BAD_INPUT, {})
    # Sort the sources, so the results are sorted too and are reproducible
    source_code_files.sort()
    source_code_files = [os.path.abspath(f) for f in source_code_files]

    # Get the absolute project root directory path to remove when printing out the results
    if project_root_dir:
        if not os.path.isdir(project_root_dir):
            print(
                "The project root directory does not exist or is not a directory:",
                project_root_dir,
            )
            return (ReturnCode.BAD_INPUT, {})
        project_root_dir = os.path.abspath(project_root_dir)
        project_root_dir = os.path.join(project_root_dir, "")  # Add the trailing slash

    # Find the largest string length to format the textual output
    largest_string_length = len(
        max(source_code_files, key=len).replace(project_root_dir, "")
    )

    # Parse the contents of all the source files
    source_code = OrderedDict()
    for source_code_file in source_code_files:
        try:
            # read file but also recover from encoding errors in source files
            with open(source_code_file, "r", errors="surrogateescape") as f:
                # Store source code with the file path as the key
                content = f.read()
                # Match the ".py" extension, not any name that merely ends in "py".
                if only_code and source_code_file.endswith(".py"):
                    content = remove_comments_and_docstrings(content)
                source_code[source_code_file] = content
        except Exception as err:
            print(f"ERROR: Failed to open file {source_code_file}, reason: {str(err)}")

    # Create a Similarity object of all the source code
    gen_docs = [
        [word.lower() for word in word_tokenize(source_code[source_file])]
        for source_file in source_code
    ]
    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    sims = gensim.similarities.Similarity(
        tempfile.gettempdir() + os.sep, tf_idf[corpus], num_features=len(dictionary)
    )

    column_label = file_column_label
    if show_loc:
        column_label += file_loc_label
        largest_string_length += len(file_loc_label)

    exit_code = ReturnCode.SUCCESS
    code_similarity = dict()
    # corpus was built in the iteration order of source_code, so each file is
    # paired with its own bag-of-words and need not be tokenised again.
    for source_file, query_doc_bow in zip(source_code, corpus):
        # Check for similarities
        query_doc_tf_idf = tf_idf[query_doc_bow]

        loc_info = ""
        source_file_loc = -1
        if show_loc:
            source_file_loc = get_loc_count(source_file)
            loc_info = "," + get_loc_to_print(source_file_loc)

        short_source_file_path = source_file.replace(project_root_dir, "")
        conditional_print(
            "\n\n\n"
            + CliColors.HEADER
            + "Code duplication probability for "
            + short_source_file_path
            + loc_info
            + CliColors.ENDC,
            json_output,
        )
        conditional_print(
            "-" * (largest_string_length + similarity_label_length), json_output
        )
        conditional_print(
            CliColors.BOLD
            + "%s %s"
            % (column_label.center(largest_string_length), similarity_column_label)
            + CliColors.ENDC,
            json_output,
        )
        conditional_print(
            "-" * (largest_string_length + similarity_label_length), json_output
        )

        empty_length = 0
        code_similarity[short_source_file_path] = dict()
        if show_loc:
            code_similarity[short_source_file_path][loc_label] = source_file_loc
            empty_length = len(code_similarity[short_source_file_path])
        for similarity, source in zip(sims[query_doc_tf_idf], source_code):
            # Ignore similarities for the same file
            if source == source_file:
                continue
            # Convert to a builtin float first: the similarity index presumably
            # yields NumPy scalars (TODO confirm), which json.dumps cannot
            # serialise unless they happen to subclass float.
            similarity_percentage = float(similarity) * 100
            # Ignore very low similarity
            if similarity_percentage < ignore_threshold:
                continue
            # The two-decimal value is what ends up in the JSON/CSV report.
            reported_percentage = round(similarity_percentage, 2)
            short_source_path = source.replace(project_root_dir, "")
            if show_loc:
                code_similarity[short_source_file_path][short_source_path] = {
                    loc_label: get_loc_count(source),
                    similarity_label: reported_percentage,
                }
            else:
                code_similarity[short_source_file_path][
                    short_source_path
                ] = reported_percentage
            if similarity_percentage > fail_threshold:
                # Record the failure so the run returns THRESHOLD_EXCEEDED.
                # The reported value is compared so that floating-point noise
                # (e.g. 100.00001 for identical files) does not fail a run at
                # the default threshold of 100.
                if reported_percentage > fail_threshold:
                    exit_code = ReturnCode.THRESHOLD_EXCEEDED
                # NOTE(review): the table shows 100.00 for every pair above the
                # threshold, whatever its real value; kept as it was - confirm
                # whether a clamp to at most 100 was intended.
                similarity_percentage = 100.00
            color = (
                CliColors.OKGREEN
                if similarity_percentage < 10
                else (
                    CliColors.WARNING if similarity_percentage < 20 else CliColors.FAIL
                )
            )
            info_to_print = short_source_path
            if show_loc:
                info_to_print += "," + get_loc_to_print(get_loc_count(source))

            conditional_print(
                "%s " % (info_to_print.ljust(largest_string_length))
                + color
                + "%.2f" % (similarity_percentage)
                + CliColors.ENDC,
                json_output,
            )
        # If no similarities found for the particular file, remove it from the report
        if len(code_similarity[short_source_file_path]) == empty_length:
            del code_similarity[short_source_file_path]

    # Reported once, after all files have been processed.
    if exit_code == ReturnCode.THRESHOLD_EXCEEDED:
        conditional_print(
            "Code duplication threshold exceeded. Please consult logs.", json_output
        )

    if json_output:
        similarities_json = json.dumps(code_similarity, indent=4)
        print(similarities_json)

    if csv_output:
        # newline="" lets the csv module control line endings; without it
        # every row is followed by a blank line on Windows.
        with open(csv_output, "w", newline="") as csv_file:
            writer = csv.writer(csv_file)
            if show_loc:
                writer.writerow(["File A", "#LoC A", "File B", "#LoC B", "Similarity"])
                for first_file in code_similarity:
                    for second_file in code_similarity[first_file]:
                        # The "#LoC" key holds first_file's own count, not a pair.
                        if second_file != loc_label:
                            writer.writerow(
                                [
                                    first_file,
                                    get_loc_to_print(
                                        get_loc_count(
                                            os.path.join(project_root_dir, first_file)
                                        )
                                    ),
                                    second_file,
                                    get_loc_to_print(
                                        get_loc_count(
                                            os.path.join(project_root_dir, second_file)
                                        )
                                    ),
                                    code_similarity[first_file][second_file][
                                        similarity_label
                                    ],
                                ]
                            )
            else:
                writer.writerow(["File A", "File B", "Similarity"])
                for first_file in code_similarity:
                    for second_file in code_similarity[first_file]:
                        writer.writerow(
                            [
                                first_file,
                                second_file,
                                code_similarity[first_file][second_file],
                            ]
                        )

    return (exit_code, code_similarity)
480
481
if __name__ == "__main__":
    # main() returns (ReturnCode, report); only the code matters to the shell.
    status = main()[0]
    sys.exit(status.value)
ANSI color codes for terminal text formatting.
Enumeration of possible return codes for the application.
conditional_print(text, machine_friendly_output)
str remove_comments_and_docstrings(str source_code)
Strip comments and docstrings from source code.
get_all_source_code_from_directory(directory, file_extensions)
Get a list with all the source code files within the directory.
run(fail_threshold, directories, files, ignore_directories, ignore_files, json_output, project_root_dir, file_extensions, ignore_threshold, only_code, csv_output, show_loc)