Arkanjo 0.1
A tool for finding duplicated functions in codebases
Loading...
Searching...
No Matches
duplication_finder_diff.cpp
Go to the documentation of this file.
2
3DuplicationFinderDiff::DuplicationFinderDiff(string base_path_, double similarity_){
4 base_path = base_path_;
5 similarity = similarity_;
6
7 if(similarity < 0){
8 std::cout << "SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY DIFF COMMAND";
9 }
10}
11
12vector<string> DuplicationFinderDiff::find_files(string folder_path){
13 vector<string> file_paths;
14 for(const auto &dirEntry: std::filesystem::recursive_directory_iterator(folder_path)){
15 string file_path = dirEntry.path().string();
16 if(Utils::does_file_exist(file_path) && Utils::is_regular_file(file_path)){
17 file_paths.push_back(file_path);
18 }
19 }
20 return file_paths;
21}
22
23bool DuplicationFinderDiff::is_empty_line(string line){
24 int line_size = line.size();
25 bool is_empty = true;
26 for(int i = 1; i < line_size; i++){
27 is_empty &= Utils::is_empty_char(line[i]);
28 }
29 return is_empty;
30}
31
32vector<string> DuplicationFinderDiff::remove_blank_lines(vector<string> content){
33 vector<string> ret;
34 for(auto line : content){
35 if(!is_empty_line(line)){
36 ret.push_back(line);
37 }
38 }
39 return ret;
40}
41
42bool DuplicationFinderDiff::is_equal_files(vector< string> content1, vector<string> content2){
43 if(content1.size() != content2.size()){
44 return false;
45 }
46 int sz = content1.size();
47 for(int i = 0; i < sz; i++){
48 if(content1[i] != content2[i]){
49 return false;
50 }
51 }
52 return true;
53}
54
55double DuplicationFinderDiff::find_similarity(string path1, string path2){
56 vector<string> content1 = Utils::read_file_generic(path1);
57 vector<string> content2 = Utils::read_file_generic(path2);
58
59 content1 = remove_blank_lines(content1);
60 content2 = remove_blank_lines(content2);
61
62 if(is_equal_files(content1,content2)){
63 return 100;
64 }
65
66 //execute diff command to extract the differences
67 string output_file = base_path+"/diff.txt";
68
69 //-c is to print in the desired format
70 //2>&1 is to also send standard error to the output_file
71 string command = "diff " + path1 + " " + path2 + " -c > " +output_file + " 2>&1";
72 system(command.c_str());
73
74 vector<string> content = Utils::read_file_generic(output_file);
75
76 int number_lines = content.size();
77
78 int different_lines = 0;
79
80 for(int i = 4; i < number_lines; i++){
81 auto& line = content[i];
82 if(is_empty_line(line)){
83 continue;
84 }
85 if(line[0] == '!' || line[0] == '-' || line[0] == '+'){
86 different_lines += 1;
87 }
88 }
89
90 int equal_lines = (int)content1.size() + (int)content2.size();
91 equal_lines -= different_lines;
92
93 if(different_lines+equal_lines == 0){
94 return -1;
95 }
96
97 double db_equal_lines = equal_lines;
98 double db_different_lines = different_lines;
99 double similarity_metric = db_equal_lines/(db_different_lines+db_equal_lines);
100 similarity_metric *= 100;
101 return similarity_metric;
102}
103
104vector<tuple<double,string,string>> DuplicationFinderDiff::find_similar_pairs(vector<string> &file_paths){
105 size_t number_files = file_paths.size();
106 vector<tuple<double,string,string>> ret;
107 for(size_t i = 0; i < number_files; i++){
108 for(size_t j = 0; j < number_files; j++) if(i != j){
109 string file_path1 = file_paths[i];
110 string file_path2 = file_paths[j];
111 double similarity_metric = find_similarity(file_path1,file_path2);
112 if( similarity_metric >= similarity){
113 ret.push_back({similarity_metric,file_path1,file_path2});
114 }
115 }
116 }
117 sort(ret.rbegin(),ret.rend());
118 return ret;
119}
120
121void DuplicationFinderDiff::save_duplications(vector<tuple<double,string,string>> &file_duplication_pairs){
122 string output_file_path = base_path+"/output_parsed.txt";
123
124 auto fout = ofstream(output_file_path);
125
126 fout << file_duplication_pairs.size() << '\n';
127 for(const auto &[similarity,path1,path2] : file_duplication_pairs){
128 fout << path1 << ' ' << path2 << ' ';
129 fout << fixed << setprecision(2) << similarity << '\n';
130 }
131
132 fout.close();
133}
134
136
137 vector<string> file_paths = find_files(base_path+"/source");
138
139 vector<tuple<double,string,string>> file_duplication_pairs = find_similar_pairs(file_paths);
140
141 cout << SAVING_MESSAGE << '\n';
142
143 save_duplications(file_duplication_pairs);
144}
void execute()
Executes the preprocessing pipeline.
DuplicationFinderDiff(string base_path_, double similarity_)
Constructs preprocessor with configuration.
return ret
Definition sum.c:3
Code duplication preprocessing system.
bool is_regular_file(string path)
Determines if a path refers to a regular file.
Definition utils.cpp:77
bool does_file_exist(string file_path)
Checks if a file exists at the given path.
Definition utils.cpp:68
bool is_empty_char(char c)
Checks if a character is considered empty/whitespace.
Definition utils.cpp:93
vector< string > read_file_generic(string string_path)
Reads a file line by line into a vector of strings.
Definition utils.cpp:19