ArKanjo 0.2
A tool for finding duplicated functions in codebases
Loading...
Searching...
No Matches
diff_method.cpp
Go to the documentation of this file.
6
7using fm = FormatterManager;
8
9DiffMethod::DiffMethod(const fs::path& base_path_, double similarity_) {
10 base_path = base_path_;
11 similarity = similarity_;
12
13 if (similarity < 0) {
14 std::cerr << "SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY DIFF COMMAND";
15 }
16}
17
18std::vector<std::string> DiffMethod::find_files(const fs::path& folder_path) {
19 std::vector<std::string> file_paths;
20 for (const auto& dirEntry : std::filesystem::recursive_directory_iterator(folder_path)) {
21 fs::path file_path = dirEntry.path();
22 if (std::filesystem::exists(file_path) && std::filesystem::is_regular_file(file_path)) {
23 file_paths.push_back(file_path.string());
24 }
25 }
26 return file_paths;
27}
28
29bool DiffMethod::is_empty_line(std::string line) {
30 size_t line_size = line.size();
31 bool is_empty = true;
32 for (size_t i = 1; i < line_size; i++) {
33 is_empty &= Utils::is_empty_char(line[i]);
34 }
35 return is_empty;
36}
37
38std::vector<std::string> DiffMethod::remove_blank_lines(std::vector<std::string> content) {
39 std::vector<std::string> ret;
40 for (auto line : content) {
41 if (!is_empty_line(line)) {
42 ret.push_back(line);
43 }
44 }
45 return ret;
46}
47
48bool DiffMethod::is_equal_files(std::vector<std::string> content1, std::vector<std::string> content2) {
49 if (content1.size() != content2.size()) {
50 return false;
51 }
52 size_t sz = content1.size();
53 for (size_t i = 0; i < sz; i++) {
54 if (content1[i] != content2[i]) {
55 return false;
56 }
57 }
58 return true;
59}
60
61double DiffMethod::find_similarity(std::string path1, std::string path2) {
62 std::vector<std::string> content1 = Utils::read_file_with_vector(path1);
63 std::vector<std::string> content2 = Utils::read_file_with_vector(path2);
64
65 content1 = remove_blank_lines(content1);
66 content2 = remove_blank_lines(content2);
67
68 if (is_equal_files(content1, content2)) {
69 return 100;
70 }
71
72 // execute diff command to extract the differences
73 fs::path output_file = base_path / "diff.txt";
74
75 //-c is to print in the desired format
76 // 2>&1 is to also send standard error to the output_file
77 std::string command = "diff " + path1 + " " + path2 + " -c > " + output_file.string() + " 2>&1";
78 system(command.c_str());
79
80 std::vector<std::string> content = Utils::read_file_with_vector(output_file);
81
82 size_t number_lines = content.size();
83
84 int different_lines = 0;
85
86 for (size_t i = 4; i < number_lines; i++) {
87 auto& line = content[i];
88 if (is_empty_line(line)) {
89 continue;
90 }
91 if (line[0] == '!' || line[0] == '-' || line[0] == '+') {
92 different_lines += 1;
93 }
94 }
95
96 int equal_lines = (int)content1.size() + (int)content2.size();
97 equal_lines -= different_lines;
98
99 if (different_lines + equal_lines == 0) {
100 return -1;
101 }
102
103 double db_equal_lines = equal_lines;
104 double db_different_lines = different_lines;
105 double similarity_metric = db_equal_lines / (db_different_lines + db_equal_lines);
106 similarity_metric *= 100;
107 return similarity_metric;
108}
109
110std::vector<DuplicationEntry> DiffMethod::find_similar_pairs(std::vector<std::string>& file_paths) {
111 size_t number_files = file_paths.size();
112 std::vector<DuplicationEntry> ret;
113 for (size_t i = 0; i < number_files; i++) {
114 for (size_t j = 0; j < number_files; j++)
115 if (i != j) {
116 std::string file_path1 = file_paths[i];
117 std::string file_path2 = file_paths[j];
118 double similarity_metric = find_similarity(file_path1, file_path2);
119 if (similarity_metric >= similarity) {
120 ret.push_back({similarity_metric, file_path1, file_path2});
121 }
122 }
123 }
124 sort(ret.rbegin(), ret.rend());
125 return ret;
126}
127
128void DiffMethod::save_duplications(std::vector<DuplicationEntry>& file_duplication_pairs) {
129 std::string output_file_path = base_path / "output_parsed.txt";
130
131 auto fout = std::ofstream(output_file_path);
132
133 fout << file_duplication_pairs.size() << '\n';
134 for (const auto& [similarity, path1, path2] : file_duplication_pairs) {
135 fout << path1 << ' ' << path2 << ' ';
136 fout << std::fixed << std::setprecision(2) << similarity << '\n';
137 }
138
139 fout.close();
140}
141
// NOTE(review): the signature line of this definition (listing line 142) is
// missing from this view; presumably this is the body of
// DiffMethod::on_function(const FunctionData& fd) -- confirm against the real file.
//
// Writes the source code of one function to its own file below
// base_path / source_feature_path.
143 fs::path base = base_path / source_feature_path;
144
// Nothing is written when get_feature<SourceFeature>() yields a null result.
145 auto source = fd.get_feature<SourceFeature>();
146 if (!source)
147 return;
148
// Destination: <base>/<fd.path>/<function_name><extension of fd.path>.
// NOTE(review): std::filesystem's operator/ discards the left-hand side when
// the right-hand side is absolute, so an absolute fd.path would bypass `base`
// -- presumably fd.path is always relative; verify against the caller.
149 fs::path relative(fd.path);
150 std::string filename = fd.function_name + relative.extension().string();
151 fs::path path = base / relative / filename;
// The code is written followed by a single newline.
152 Utils::write_file(path, source->code + "\n");
153}
154
// NOTE(review): the signature line of this definition (listing line 155) is
// missing from this view; presumably this is the body of DiffMethod::execute()
// -- confirm against the real file.
//
// Compares the files found below base_path / source_feature_path with each
// other and saves the pairs that are similar enough.
156 fs::path base = base_path / source_feature_path;
157
// Every regular file below `base`, searched recursively.
158 std::vector<std::string> file_paths = find_files(base);
159
// Pairs whose score reaches the similarity threshold, best score first.
160 std::vector<DuplicationEntry> file_duplication_pairs = find_similar_pairs(file_paths);
161
// The saving message is emitted only after all comparisons have finished.
162 fm::write(SAVING_MESSAGE);
163
// Written to base_path / "output_parsed.txt".
164 save_duplications(file_duplication_pairs);
165}
void on_function(const FunctionData &fd) override
void execute() override
Executes the preprocessing pipeline.
DiffMethod(const fs::path &base_path_, double similarity_)
Constructs preprocessor with configuration.
static void write(const std::string &template_str, const std::vector< T > &data, enum Format effective=Format::AUTO, RowColorFn color_fn=nullptr, std::ostream &out=std::cout)
std::string function_name
Name of the function.
std::string path
std::shared_ptr< T > get_feature() const
Configuration management interface.
Code duplication preprocessing system.
void write_file(const fs::path &path, const std::string &content)
Writes content to a file at specified path.
Definition utils.cpp:19
std::vector< std::string > read_file_with_vector(const fs::path &path)
Reads a file line by line into a vector of strings.
Definition utils.cpp:26
bool is_empty_char(char c)
Checks if a character is considered empty/whitespace.
Definition utils.cpp:69
fs::path source_feature_path
Defines utility functions used across all files.