Arkanjo 0.2
A tool for find code duplicated functions in codebases
Loading...
Searching...
No Matches
similarity_table.cpp
Go to the documentation of this file.
2
3PathId Similarity_Table::find_id_path(const Path& path) {
4 auto [it, inserted] = path_id.try_emplace(path, paths.size());
5
6 if (inserted) {
7 paths.push_back(path);
8 similarity_graph.emplace_back();
9 }
10
11 return it->second;
12}
13
14void Similarity_Table::read_comparation(std::ifstream& table_file) {
15 std::string string_path1, string_path2;
16 double similarity;
17 table_file >> string_path1 >> string_path2 >> similarity;
18
19 PathId id1 = find_id_path(Path(string_path1));
20 PathId id2 = find_id_path(Path(string_path2));
21
22 if (id1 > id2) {
23 std::swap(id1, id2);
24 }
25
26 similarity_graph[id1].push_back(std::make_pair(id2, similarity));
27 similarity_graph[id2].push_back(std::make_pair(id1, similarity));
28 similarity_table[std::make_pair(id1, id2)] = similarity;
29}
30
31void Similarity_Table::read_file_table(std::ifstream& table_file) {
32 int number_comparations;
33 table_file >> number_comparations;
34 for (int i = 0; i < number_comparations; i++) {
35 read_comparation(table_file);
36 }
37}
38
39void Similarity_Table::init_similarity_table() {
40 std::ifstream table_file;
41 const fs::path similarity_table_file_name = Config::config().base_path / Config::config().name_container / SIMILARITY_TABLE_FILE_NAME;
42 table_file.open(similarity_table_file_name);
43 Utils::ensure_file_is_open(table_file, similarity_table_file_name);
44
45 read_file_table(table_file);
46
47 table_file.close();
48}
49
50Similarity_Table::Similarity_Table(double _similarity_threshold)
51 : similarity_threshold{_similarity_threshold} { }
52
54 : similarity_threshold{DEFAULT_SIMILARITY} { }
55
57 init_similarity_table();
58}
59
60void Similarity_Table::update_similarity(double new_similarity_threshold) {
61 similarity_threshold = new_similarity_threshold;
62}
63
64double Similarity_Table::get_similarity(const Path& path1, const Path& path2) {
65 PathId id1 = find_id_path(path1);
66 PathId id2 = find_id_path(path2);
67
68 if (id1 == id2) {
69 return MAXIMUM_SIMILARITY;
70 }
71 if (id1 > id2) {
72 std::swap(id1, id2);
73 }
74 std::pair<PathId, PathId> aux = std::make_pair(id1, id2);
75 if (similarity_table.find(aux) != similarity_table.end()) {
76 return similarity_table[aux];
77 }
78 return MINIMUM_SIMILARITY;
79}
80
81bool Similarity_Table::is_above_threshold(double similarity) const {
82 return similarity_threshold <= similarity + EPS_ERROR_MARGIN;
83}
84
85bool Similarity_Table::is_similar(const Path& path1, const Path& path2) {
86 double similarity = get_similarity(path1, path2);
87 return is_above_threshold(similarity);
88}
89
90const std::vector<Path>& Similarity_Table::get_path_list() const {
91 return paths;
92}
93
94std::vector<Path> Similarity_Table::get_similar_path_to_the_reference(const Path& reference) {
95 int reference_id = find_id_path(reference);
96 std::vector<Path> ret;
97 for (auto [neighbor_id, similarity] : similarity_graph[reference_id]) {
98 if (is_above_threshold(similarity)) {
99 ret.push_back(paths[neighbor_id]);
100 }
101 }
102 return ret;
103}
104
106 std::vector<std::tuple<double, Path, Path>> similar_path_pairs;
107 for (auto [ids, similarity] : similarity_table) {
108 Path path1 = paths[ids.first];
109 Path path2 = paths[ids.second];
110 if (is_similar(path1, path2)) {
111 similar_path_pairs.push_back({similarity, path1, path2});
112 }
113 }
114 sort(similar_path_pairs.rbegin(), similar_path_pairs.rend());
115 return similar_path_pairs;
116}
117
120 std::vector<std::pair<Path, Path>> ret;
121 for (auto [similarity, path1, path2] : similar_path_pairs) {
122 ret.push_back({path1, path2});
123 }
124 return ret;
125}
126
127std::vector<std::tuple<int, Path, Path>> Similarity_Table::sort_pairs_by_line_number(const std::vector<std::pair<Path, Path>>& similar_path_pairs) const {
128 std::vector<std::tuple<int, Path, Path>> similar_path_pairs_with_number_of_lines;
129 for (auto [path1, path2] : similar_path_pairs) {
130 Function function(path1);
131 function.load();
132 std::tuple<int, Path, Path> aux = {function.number_of_lines(), path1, path2};
133 similar_path_pairs_with_number_of_lines.push_back(aux);
134 }
135 std::sort(
136 similar_path_pairs_with_number_of_lines.begin(),
137 similar_path_pairs_with_number_of_lines.end(),
138 [&](std::tuple<int, Path, Path> pair1, std::tuple<int, Path, Path> pair2) {
139 int number_lines1 = std::get<0>(pair1);
140 int number_lines2 = std::get<0>(pair2);
141 return number_lines1 > number_lines2;
142 });
143 return similar_path_pairs_with_number_of_lines;
144}
145
147 std::vector<std::pair<Path, Path>> similar_path_pairs = get_all_similar_path_pairs_sorted_by_similarity();
148
149 std::vector<std::tuple<int, Path, Path>> similar_path_pairs_with_number_of_lines =
150 sort_pairs_by_line_number(similar_path_pairs);
151
152 std::vector<std::pair<Path, Path>> ret;
153 for (auto [line_number, path1, path2] : similar_path_pairs_with_number_of_lines) {
154 ret.push_back({path1, path2});
155 }
156
157 return ret;
158}
159
160int Similarity_Table::get_number_lines_in_pair(const Path& path1, const Path& path2) {
161 Function f1(path1);
162 f1.load();
163 Function f2(path2);
164 f2.load();
165 return f1.number_of_lines() + f2.number_of_lines();
166}
167
168std::vector<Cluster> Similarity_Table::get_clusters() {
169 int n = paths.size();
170
171 std::vector<bool> visited(n, false);
172 std::vector<Cluster> clusters;
173
174 for (int i = 0; i < n; i++) {
175 if (visited[i]) continue;
176
177 std::vector<int> stack;
178 std::vector<int> component;
179
180 stack.push_back(i);
181 visited[i] = true;
182
183 while (!stack.empty()) {
184 int current = stack.back();
185 stack.pop_back();
186
187 component.push_back(current);
188
189 for (auto [neighbor, similarity] : similarity_graph[current]) {
190 if (!visited[neighbor] && is_above_threshold(similarity)) {
191 visited[neighbor] = true;
192 stack.push_back(neighbor);
193 }
194 }
195 }
196
197 if (component.size() > 1) {
198 clusters.push_back({component});
199 }
200 }
201
202 return clusters;
203}
204
205std::vector<ClusterInfo> Similarity_Table::get_clusters_info(bool sorted) {
206 auto raw_clusters = get_clusters();
207 std::vector<ClusterInfo> clusters_info;
208
209 for (const auto& cluster : raw_clusters) {
210 ClusterInfo info;
211
212 for (int id : cluster.members) {
213 info.paths.push_back(paths[id]);
214 }
215
216 for (size_t i = 0; i < info.paths.size(); i++) {
217 for (size_t j = i + 1; j < info.paths.size(); j++) {
218 int lines = get_number_lines_in_pair(info.paths[i], info.paths[j]);
219 if (lines > 0) {
220 info.total_pairs++;
221 info.total_lines += lines;
222 }
223 }
224 }
225
226 if (info.total_pairs > 0) {
227 clusters_info.push_back(info);
228 }
229 }
230
231 if (sorted) {
232 std::sort(clusters_info.begin(), clusters_info.end(),
233 [](const ClusterInfo& a, const ClusterInfo& b) {
234 return a.score() > b.score();
235 });
236 }
237
238 return clusters_info;
239}
static Config & config()
Gets the singleton configuration instance.
Definition config.cpp:37
fs::path base_path
Default base path for temporary files.
Definition config.hpp:42
fs::path name_container
Name of the cache container.
Definition config.hpp:44
Represents a code function with its content and metadata.
Definition function.hpp:30
void load()
Definition function.cpp:30
int number_of_lines() const
Calculates the total number of lines in the function.
Definition function.cpp:24
Path manipulation class for tool-specific directory structure.
Definition path.hpp:27
void update_similarity(double new_similarity_threshold)
Updates similarity threshold.
std::vector< std::tuple< double, Path, Path > > get_all_path_pairs_and_similarity_sorted_by_similarity()
Gets all similar path pairs with scores, sorted.
std::vector< ClusterInfo > get_clusters_info(bool sorted)
Returns detailed information about all clusters found in the similarity table.
int get_number_lines_in_pair(const Path &path1, const Path &path2)
Similarity_Table()
Constructs with default similarity threshold.
std::vector< std::pair< Path, Path > > get_all_similar_path_pairs_sorted_by_similarity()
Gets all similar path pairs, sorted by similarity.
double get_similarity(const Path &path1, const Path &path2)
Gets similarity between two paths.
std::vector< Cluster > get_clusters()
Generate clusters of similar functions using DFS on the similarity graph.
std::vector< std::pair< Path, Path > > get_all_similar_path_pairs_sorted_by_line_number()
Gets all similar path pairs, sorted by line count.
const std::vector< Path > & get_path_list() const
Gets list of all known paths.
bool is_similar(const Path &path1, const Path &path2)
Checks if two paths are similar.
std::vector< Path > get_similar_path_to_the_reference(const Path &reference)
Gets paths similar to reference path.
void ensure_file_is_open(const std::ifstream &file, const fs::path &file_name)
Ensures that a file stream is successfully opened.
Definition utils.cpp:5
Similarity relationships storage and analysis.
std::vector< Path > paths