/**
 * @brief Looks up the id associated with `path`, reserving a new id when the path is unknown.
 *
 * @param path Path whose id is requested.
 * @return PathId for `path` (presumably the value stored in `path_id`; the
 *         return statement is not visible in this block — TODO confirm).
 */
3PathId Similarity_Table::find_id_path(
const Path& path) {
// try_emplace only inserts when `path` has no entry yet; a new entry receives
// the current paths.size() as its id. `inserted` reports which case happened.
4 auto [it, inserted] = path_id.try_emplace(path, paths.size());
// Appends an empty adjacency entry to similarity_graph.
// NOTE(review): presumably done only for newly inserted paths so the graph
// stays index-aligned with the ids; the guarding condition is not visible here.
8 similarity_graph.emplace_back();
/**
 * @brief Reads one comparison record from `table_file` and stores it in the
 *        similarity graph and the similarity table.
 *
 * A record is extracted as two whitespace-delimited path strings followed by
 * a similarity value.
 *
 * @param table_file Input stream positioned at the start of a record.
 */
14void Similarity_Table::read_comparation(std::ifstream& table_file) {
15 std::string string_path1, string_path2;
// Formatted extraction: two tokens for the paths, then the similarity value.
17 table_file >> string_path1 >> string_path2 >> similarity;
// NOTE(review): id1 and id2 are computed in statements not visible in this
// block (presumably via find_id_path on the two paths — TODO confirm).
// The edge is recorded in both directions so either endpoint lists the other.
26 similarity_graph[id1].push_back(std::make_pair(id2, similarity));
27 similarity_graph[id2].push_back(std::make_pair(id1, similarity));
// Stores the value under the key (id1, id2); the visible code writes no
// (id2, id1) entry.
28 similarity_table[std::make_pair(id1, id2)] = similarity;
/**
 * @brief Reads the record count from `table_file`, then reads that many
 *        comparison records.
 *
 * @param table_file Input stream; its first token is read as the number of
 *                   records that follow.
 */
31void Similarity_Table::read_file_table(std::ifstream& table_file) {
32 int number_comparations;
// NOTE(review): the stream state is not checked after this extraction in the
// visible code, so a malformed header is not reported here.
33 table_file >> number_comparations;
// One read_comparation call per announced record.
34 for (
int i = 0; i < number_comparations; i++) {
35 read_comparation(table_file);
/**
 * @brief Opens the similarity table file and loads it through read_file_table.
 *
 * NOTE(review): where similarity_table_file_name comes from, and whether the
 * open is verified before reading, is not visible in this block — confirm.
 */
39void Similarity_Table::init_similarity_table() {
40 std::ifstream table_file;
42 table_file.open(similarity_table_file_name);
// Hands the opened stream to the parser, which reads the record count and
// then each record.
45 read_file_table(table_file);
51 : similarity_threshold{_similarity_threshold} { }
54 : similarity_threshold{DEFAULT_SIMILARITY} { }
57 init_similarity_table();
61 similarity_threshold = new_similarity_threshold;
65 PathId id1 = find_id_path(path1);
66 PathId id2 = find_id_path(path2);
69 return MAXIMUM_SIMILARITY;
74 std::pair<PathId, PathId> aux = std::make_pair(id1, id2);
75 if (similarity_table.find(aux) != similarity_table.end()) {
76 return similarity_table[aux];
78 return MINIMUM_SIMILARITY;
/**
 * @brief Tells whether `similarity` reaches the configured threshold, within
 *        a tolerance.
 *
 * @param similarity Value compared against similarity_threshold.
 * @return true when similarity_threshold <= similarity + EPS_ERROR_MARGIN.
 */
81bool Similarity_Table::is_above_threshold(
double similarity)
const {
// EPS_ERROR_MARGIN is added to the candidate, so values marginally below the
// threshold still pass (presumably to absorb floating-point error).
82 return similarity_threshold <= similarity + EPS_ERROR_MARGIN;
87 return is_above_threshold(similarity);
95 int reference_id = find_id_path(reference);
96 std::vector<Path> ret;
97 for (
auto [neighbor_id, similarity] : similarity_graph[reference_id]) {
98 if (is_above_threshold(similarity)) {
99 ret.push_back(paths[neighbor_id]);
106 std::vector<std::tuple<double, Path, Path>> similar_path_pairs;
107 for (
auto [ids, similarity] : similarity_table) {
108 Path path1 = paths[ids.first];
109 Path path2 = paths[ids.second];
111 similar_path_pairs.push_back({similarity, path1, path2});
114 sort(similar_path_pairs.rbegin(), similar_path_pairs.rend());
115 return similar_path_pairs;
120 std::vector<std::pair<Path, Path>> ret;
121 for (
auto [similarity, path1, path2] : similar_path_pairs) {
122 ret.push_back({path1, path2});
/**
 * @brief Attaches a line count to each path pair and returns the resulting
 *        (line count, path1, path2) tuples.
 *
 * The comparator below orders tuples by line count, largest first.
 *
 * @param similar_path_pairs Pairs of paths to annotate.
 * @return Vector of (line count, path1, path2) tuples.
 */
127std::vector<std::tuple<int, Path, Path>> Similarity_Table::sort_pairs_by_line_number(
const std::vector<std::pair<Path, Path>>& similar_path_pairs)
const {
128 std::vector<std::tuple<int, Path, Path>> similar_path_pairs_with_number_of_lines;
129 for (
auto [path1, path2] : similar_path_pairs) {
// NOTE(review): `function` is declared in statements not visible in this
// block; which path of the pair it is built from must be confirmed there.
132 std::tuple<int, Path, Path> aux = {function.number_of_lines(), path1, path2};
133 similar_path_pairs_with_number_of_lines.push_back(aux);
// Arguments of a call whose name is not visible here (presumably std::sort
// over the whole vector — TODO confirm).
136 similar_path_pairs_with_number_of_lines.begin(),
137 similar_path_pairs_with_number_of_lines.end(),
// Comparator: only the first tuple element (the line count) is compared, and
// the tuple with the larger count is ordered first.
138 [&](std::tuple<int, Path, Path> pair1, std::tuple<int, Path, Path> pair2) {
139 int number_lines1 = std::get<0>(pair1);
140 int number_lines2 = std::get<0>(pair2);
141 return number_lines1 > number_lines2;
143 return similar_path_pairs_with_number_of_lines;
149 std::vector<std::tuple<int, Path, Path>> similar_path_pairs_with_number_of_lines =
150 sort_pairs_by_line_number(similar_path_pairs);
152 std::vector<std::pair<Path, Path>> ret;
153 for (
auto [line_number, path1, path2] : similar_path_pairs_with_number_of_lines) {
154 ret.push_back({path1, path2});
169 int n = paths.size();
171 std::vector<bool> visited(n,
false);
172 std::vector<Cluster> clusters;
174 for (
int i = 0; i < n; i++) {
175 if (visited[i])
continue;
177 std::vector<int> stack;
178 std::vector<int> component;
183 while (!stack.empty()) {
184 int current = stack.back();
187 component.push_back(current);
189 for (
auto [neighbor, similarity] : similarity_graph[current]) {
190 if (!visited[neighbor] && is_above_threshold(similarity)) {
191 visited[neighbor] =
true;
192 stack.push_back(neighbor);
197 if (component.size() > 1) {
198 clusters.push_back({component});
207 std::vector<ClusterInfo> clusters_info;
209 for (
const auto& cluster : raw_clusters) {
212 for (
int id : cluster.members) {
213 info.
paths.push_back(paths[
id]);
216 for (
size_t i = 0; i < info.
paths.size(); i++) {
217 for (
size_t j = i + 1; j < info.
paths.size(); j++) {
227 clusters_info.push_back(info);
232 std::sort(clusters_info.begin(), clusters_info.end(),
234 return a.score() > b.score();
238 return clusters_info;
static Config & config()
Gets the singleton configuration instance.
fs::path base_path
Default base path for temporary files.
fs::path name_container
Name of the cache container.
Represents a code function with its content and metadata.
int number_of_lines() const
Calculates the total number of lines in the function.
Path manipulation class for tool-specific directory structure.
void update_similarity(double new_similarity_threshold)
Updates similarity threshold.
std::vector< std::tuple< double, Path, Path > > get_all_path_pairs_and_similarity_sorted_by_similarity()
Gets all similar path pairs with scores, sorted.
std::vector< ClusterInfo > get_clusters_info(bool sorted)
Returns detailed information about all clusters found in the similarity table.
int get_number_lines_in_pair(const Path &path1, const Path &path2)
Similarity_Table()
Constructs with default similarity threshold.
std::vector< std::pair< Path, Path > > get_all_similar_path_pairs_sorted_by_similarity()
Gets all similar path pairs, sorted by similarity.
double get_similarity(const Path &path1, const Path &path2)
Gets similarity between two paths.
std::vector< Cluster > get_clusters()
Generates clusters of similar functions using DFS on the similarity graph.
std::vector< std::pair< Path, Path > > get_all_similar_path_pairs_sorted_by_line_number()
Gets all similar path pairs, sorted by line count.
const std::vector< Path > & get_path_list() const
Gets list of all known paths.
bool is_similar(const Path &path1, const Path &path2)
Checks if two paths are similar.
std::vector< Path > get_similar_path_to_the_reference(const Path &reference)
Gets paths similar to reference path.
void ensure_file_is_open(const std::ifstream &file, const fs::path &file_name)
Ensures that a file stream is successfully opened.
Similarity relationships storage and analysis.
std::vector< Path > paths