Arkanjo 0.2
A tool for finding duplicated functions in codebases
Loading...
Searching...
No Matches
duplication_finder_diff.cpp
Go to the documentation of this file.
3
4using fm = FormatterManager;
5
6DuplicationFinderDiff::DuplicationFinderDiff(const fs::path& base_path_, double similarity_) {
7 base_path = base_path_;
8 similarity = similarity_;
9
10 if (similarity < 0) {
11 std::cerr << "SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY DIFF COMMAND";
12 }
13}
14
15vector<string> DuplicationFinderDiff::find_files(const fs::path& folder_path) {
16 vector<string> file_paths;
17 for (const auto& dirEntry : std::filesystem::recursive_directory_iterator(folder_path)) {
18 fs::path file_path = dirEntry.path();
19 if (std::filesystem::exists(file_path) && std::filesystem::is_regular_file(file_path)) {
20 file_paths.push_back(file_path.string());
21 }
22 }
23 return file_paths;
24}
25
26bool DuplicationFinderDiff::is_empty_line(string line) {
27 size_t line_size = line.size();
28 bool is_empty = true;
29 for (size_t i = 1; i < line_size; i++) {
30 is_empty &= Utils::is_empty_char(line[i]);
31 }
32 return is_empty;
33}
34
35vector<string> DuplicationFinderDiff::remove_blank_lines(vector<string> content) {
36 vector<string> ret;
37 for (auto line : content) {
38 if (!is_empty_line(line)) {
39 ret.push_back(line);
40 }
41 }
42 return ret;
43}
44
45bool DuplicationFinderDiff::is_equal_files(vector<string> content1, vector<string> content2) {
46 if (content1.size() != content2.size()) {
47 return false;
48 }
49 size_t sz = content1.size();
50 for (size_t i = 0; i < sz; i++) {
51 if (content1[i] != content2[i]) {
52 return false;
53 }
54 }
55 return true;
56}
57
58double DuplicationFinderDiff::find_similarity(string path1, string path2) {
59 vector<string> content1 = Utils::read_file_generic(path1);
60 vector<string> content2 = Utils::read_file_generic(path2);
61
62 content1 = remove_blank_lines(content1);
63 content2 = remove_blank_lines(content2);
64
65 if (is_equal_files(content1, content2)) {
66 return 100;
67 }
68
69 // execute diff command to extract the differences
70 fs::path output_file = base_path / "diff.txt";
71
72 //-c is to print in the desired format
73 // 2>&1 is to also send standard error to the output_file
74 string command = "diff " + path1 + " " + path2 + " -c > " + output_file.string() + " 2>&1";
75 system(command.c_str());
76
77 vector<string> content = Utils::read_file_generic(output_file);
78
79 size_t number_lines = content.size();
80
81 int different_lines = 0;
82
83 for (size_t i = 4; i < number_lines; i++) {
84 auto& line = content[i];
85 if (is_empty_line(line)) {
86 continue;
87 }
88 if (line[0] == '!' || line[0] == '-' || line[0] == '+') {
89 different_lines += 1;
90 }
91 }
92
93 int equal_lines = (int)content1.size() + (int)content2.size();
94 equal_lines -= different_lines;
95
96 if (different_lines + equal_lines == 0) {
97 return -1;
98 }
99
100 double db_equal_lines = equal_lines;
101 double db_different_lines = different_lines;
102 double similarity_metric = db_equal_lines / (db_different_lines + db_equal_lines);
103 similarity_metric *= 100;
104 return similarity_metric;
105}
106
107vector<tuple<double, string, string>> DuplicationFinderDiff::find_similar_pairs(vector<string>& file_paths) {
108 size_t number_files = file_paths.size();
109 vector<tuple<double, string, string>> ret;
110 for (size_t i = 0; i < number_files; i++) {
111 for (size_t j = 0; j < number_files; j++)
112 if (i != j) {
113 string file_path1 = file_paths[i];
114 string file_path2 = file_paths[j];
115 double similarity_metric = find_similarity(file_path1, file_path2);
116 if (similarity_metric >= similarity) {
117 ret.push_back({similarity_metric, file_path1, file_path2});
118 }
119 }
120 }
121 sort(ret.rbegin(), ret.rend());
122 return ret;
123}
124
125void DuplicationFinderDiff::save_duplications(vector<tuple<double, string, string>>& file_duplication_pairs) {
126 string output_file_path = base_path / "output_parsed.txt";
127
128 auto fout = ofstream(output_file_path);
129
130 fout << file_duplication_pairs.size() << '\n';
131 for (const auto& [similarity, path1, path2] : file_duplication_pairs) {
132 fout << path1 << ' ' << path2 << ' ';
133 fout << fixed << setprecision(2) << similarity << '\n';
134 }
135
136 fout.close();
137}
138
140 vector<string> file_paths = find_files(base_path / "source");
141
142 vector<tuple<double, string, string>> file_duplication_pairs = find_similar_pairs(file_paths);
143
144 fm::write(SAVING_MESSAGE);
145
146 save_duplications(file_duplication_pairs);
147}
void execute()
Executes the preprocessing pipeline.
DuplicationFinderDiff(const fs::path &base_path_, double similarity_)
Constructs preprocessor with configuration.
static void write(const std::string &template_str, const std::vector< T > &data, enum Format effective=Format::AUTO, RowColorFn color_fn=nullptr, std::ostream &out=std::cout)
Code duplication preprocessing system.
bool is_empty_char(char c)
Checks if a character is considered empty/whitespace.
Definition utils.cpp:56
std::vector< std::string > read_file_generic(const fs::path &string_path)
Reads a file line by line into a vector of strings.
Definition utils.cpp:13