Arkanjo 0.2
A tool for finding duplicated functions in codebases
Loading...
Searching...
No Matches
similarity_table.hpp
Go to the documentation of this file.
1
13#pragma once
14
15#include <fstream>
16#include <map>
17#include <string>
18#include <tuple>
19#include <utility>
20#include <vector>
21
23#include <arkanjo/base/path.hpp>
26
/// Strongly-typed wrapper around the integer identifier of a path.
///
/// A default-constructed PathId holds -1 (presumably a "no path" sentinel;
/// TODO confirm against the callers).
struct PathId {
    int value;  ///< Underlying integer identifier.

    /// Wraps @p v. Explicit, so a plain int is never silently turned into a PathId.
    constexpr explicit PathId(int v = -1) noexcept : value(v) {}

    /// Two ids are equal when their underlying integers are equal.
    constexpr bool operator==(const PathId& other) const noexcept {
        return value == other.value;
    }

    /// Negation of operator==.
    constexpr bool operator!=(const PathId& other) const noexcept {
        return !(*this == other);
    }

    /// Orders ids by their underlying integer (lets PathId be a std::map key).
    constexpr bool operator<(const PathId& other) const noexcept {
        return value < other.value;
    }

    /// Implicit conversion back to the underlying int.
    /// Deliberately left non-explicit so existing callers keep compiling.
    constexpr operator int() const noexcept { return value; }
};
46
/// Adjacency-list representation of a weighted graph: one inner vector per
/// node, each holding (key, weight) pairs, e.g.
/// { 1: [{key, weight}, {key, weight}], 2: [{key, weight}], 3: [] }.
template <typename Key, typename Weight>
using AdjacencyList = std::vector<std::vector<std::pair<Key, Weight>>>;
57
/// A cluster of similar functions in the similarity graph.
struct Cluster {
    std::vector<int> members;  ///< Node indices (Path IDs) belonging to this cluster.
};
65
67 std::vector<Path> paths;
68 int total_lines = 0;
69 int total_pairs = 0;
70
71 double score() const {
72 double w_files = 0.3;
73 double w_lines = 0.4;
74 double w_density = 0.3;
75 double d = total_pairs > 0 ? (double)total_lines / total_pairs : 0;
76 return w_files * paths.size() + w_lines * total_lines + w_density * d;
77 }
78};
79
// Interior of Similarity_Table (the class header is not part of this listing).
// Per the generated documentation, the class represents a similarity graph
// between functions (paths). Only declarations are visible here.
100 private:
// Compile-time constants: a file name plus the similarity values named below.
101 static constexpr const char* SIMILARITY_TABLE_FILE_NAME = "output_parsed.txt";
102 static constexpr const double DEFAULT_SIMILARITY = 100.00;
103 static constexpr const double EPS_ERROR_MARGIN = 1e-6;
104 static constexpr const double MAXIMUM_SIMILARITY = 100.00;
105 static constexpr const double MINIMUM_SIMILARITY = 0.00;
106
// Current threshold, the known paths, and the Path -> PathId lookup.
107 double similarity_threshold;
108 std::vector<Path> paths;
109 std::map<Path, PathId> path_id;
110
// Adjacency list of (PathId, double) pairs, and a map from a PathId pair to a double.
114 AdjacencyList<PathId, double> similarity_graph;
115 std::map<std::pair<PathId, PathId>, double> similarity_table;
116
// Private helpers (declared only; their definitions are not in this listing).
122 PathId find_id_path(const Path& path);
123
131 void read_comparation(std::ifstream& table_file);
132
137 void read_file_table(std::ifstream& table_file);
138
142 void init_similarity_table();
143
149 bool is_above_threshold(double similarity) const;
150
156 std::vector<std::tuple<int, Path, Path>> sort_pairs_by_line_number(const std::vector<std::pair<Path, Path>>& similar_path_pairs) const;
157
158 public:
// Presumably constructs with the given similarity threshold (TODO confirm).
163 explicit Similarity_Table(double _similarity_threshold);
164
// Constructs with default similarity threshold.
168 explicit Similarity_Table();
169
170 void load();
171
// Updates similarity threshold.
176 void update_similarity(double new_similarity_threshold);
177
// Gets similarity between two paths.
184 double get_similarity(const Path& path1, const Path& path2);
185
// Checks if two paths are similar.
192 bool is_similar(const Path& path1, const Path& path2);
193
// Gets list of all known paths.
198 const std::vector<Path>& get_path_list() const;
199
200 int get_number_lines_in_pair(const Path& path1, const Path& path2);
201
// Gets paths similar to reference path.
207 std::vector<Path> get_similar_path_to_the_reference(const Path& reference);
208
// Gets all similar path pairs with scores, sorted.
213 std::vector<std::tuple<double, Path, Path>> get_all_path_pairs_and_similarity_sorted_by_similarity();
214
// Gets all similar path pairs, sorted by similarity.
219 std::vector<std::pair<Path, Path>> get_all_similar_path_pairs_sorted_by_similarity();
220
// Gets all similar path pairs, sorted by line count.
225 std::vector<std::pair<Path, Path>> get_all_similar_path_pairs_sorted_by_line_number();
226
// Generates clusters of similar functions using DFS on the similarity graph.
232 std::vector<Cluster> get_clusters();
233
// Returns detailed information about all clusters found in the similarity table.
239 std::vector<ClusterInfo> get_clusters_info(bool sorted);
240};
Path manipulation class for tool-specific directory structure.
Definition path.hpp:27
Represents a similarity graph between functions (paths).
void update_similarity(double new_similarity_threshold)
Updates similarity threshold.
std::vector< std::tuple< double, Path, Path > > get_all_path_pairs_and_similarity_sorted_by_similarity()
Gets all similar path pairs with scores, sorted.
std::vector< ClusterInfo > get_clusters_info(bool sorted)
Returns detailed information about all clusters found in the similarity table.
int get_number_lines_in_pair(const Path &path1, const Path &path2)
Similarity_Table()
Constructs with default similarity threshold.
std::vector< std::pair< Path, Path > > get_all_similar_path_pairs_sorted_by_similarity()
Gets all similar path pairs, sorted by similarity.
double get_similarity(const Path &path1, const Path &path2)
Gets similarity between two paths.
std::vector< Cluster > get_clusters()
Generate clusters of similar functions using DFS on the similarity graph.
std::vector< std::pair< Path, Path > > get_all_similar_path_pairs_sorted_by_line_number()
Gets all similar path pairs, sorted by line count.
const std::vector< Path > & get_path_list() const
Gets list of all known paths.
bool is_similar(const Path &path1, const Path &path2)
Checks if two paths are similar.
std::vector< Path > get_similar_path_to_the_reference(const Path &reference)
Gets paths similar to reference path.
Configuration management interface.
Function abstraction for temporary codebase.
Path abstraction for temporary codebase.
std::vector< std::vector< std::pair< Key, Weight > > > AdjacencyList
{ 1: [{ key, weight }, { key, weight }] 2: [{ key, weight }] 3: [] ... }
std::vector< Path > paths
double score() const
Represents a cluster of similar functions in the similarity graph.
std::vector< int > members
List of node indices (Path IDs) in the cluster.
bool operator==(const PathId &other) const
bool operator!=(const PathId &other) const
PathId(int v=-1)
bool operator<(const PathId &other) const
Defines utility functions used across all files.