ArKanjo 0.2
A tool for find code duplicated functions in codebases
Loading...
Searching...
No Matches
ast_method.cpp
Go to the documentation of this file.
7
8#include <thread>
9
10using fm = FormatterManager;
11
12struct ZSNode {
13 std::string type;
14 std::vector<ZSNode> children;
15};
16
17ZSNode from_tsnode(TSNode node) {
18 ZSNode out;
19 out.type = ts_node_type(node);
20
21 uint32_t count = ts_node_named_child_count(node);
22
23 for (uint32_t i = 0; i < count; i++) {
24 out.children.push_back(from_tsnode(ts_node_named_child(node, i)));
25 }
26
27 return out;
28}
29
30int build_postorder(const ZSNode& node, PostOrderTree& tree) {
31 int leftmost = -1;
32
33 for (const auto& child : node.children) {
34 int child_lmd = build_postorder(child, tree);
35
36 if (leftmost == -1) {
37 leftmost = child_lmd;
38 }
39 }
40
41 int id = tree.labels.size();
42
43 tree.labels.push_back(node.type);
44
45 if (leftmost == -1) {
46 leftmost = id;
47 }
48
49 tree.lmd.push_back(leftmost);
50
51 return leftmost;
52}
53
54int ASTMethod::tree_distance(const PostOrderTree& a, const PostOrderTree& b) {
55 size_t n = a.labels.size();
56 size_t m = b.labels.size();
57
58 std::vector<std::vector<int>> dp(n + 1, std::vector<int>(m + 1));
59
60 for (size_t i = 0; i <= n; i++) {
61 dp[i][0] = i;
62 }
63
64 for (size_t j = 0; j <= m; j++) {
65 dp[0][j] = j;
66 }
67
68 for (size_t i = 1; i <= n; i++) {
69 for (size_t j = 1; j <= m; j++) {
70
71 int rename_cost = a.labels[i - 1] == b.labels[j - 1] ? 0 : 1;
72
73 dp[i][j] = std::min({
74 dp[i - 1][j] + 1,
75 dp[i][j - 1] + 1,
76 dp[i - 1][j - 1] + rename_cost
77 });
78 }
79 }
80
81 return dp[n][m];
82}
83
84double ASTMethod::similarity_score(const PostOrderTree& a, const PostOrderTree& b) {
85 int dist = tree_distance(a, b);
86
87 int max_size = std::max(a.labels.size(), b.labels.size());
88
89 return 1.0 - (static_cast<double>(dist) / max_size);
90}
91
92ASTMethod::ASTMethod(const fs::path& base_path_, double similarity_) {
93 base_path = base_path_;
94 similarity = similarity_;
95
96 if (similarity < 0) {
97 std::cerr << "SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY AST COMMAND";
98 }
99}
100
101void ASTMethod::save_duplications(std::vector<DuplicationEntry>& file_duplication_pairs) {
102 std::string output_file_path = base_path / "output_parsed.txt";
103
104 auto fout = std::ofstream(output_file_path);
105
106 fout << file_duplication_pairs.size() << '\n';
107 for (const auto& [similarity, path1, path2] : file_duplication_pairs) {
108 fout << path1 << ' ' << path2 << ' ';
109 fout << std::fixed << std::setprecision(2) << similarity << '\n';
110 }
111
112 fout.close();
113}
114
115std::vector<DuplicationEntry> ASTMethod::compare_range(
116 const std::vector<PostOrderTree>& processed,
117 size_t begin, size_t end
118) {
119 std::vector<DuplicationEntry> local;
120
121 for (size_t i = begin; i < end; i++) {
122 for (size_t j = i + 1; j < processed.size(); j++) {
123 double sim = similarity_score(processed[i], processed[j]) * 100.0;
124
125 if (sim >= similarity)
126 local.push_back({sim, processed[i].path, processed[j].path});
127 }
128 }
129
130 return local;
131}
132
134 auto ast = fd.get_feature<ASTFeature>();
135 if (!ast)
136 return;
137
138 auto tree = from_tsnode(ast->root);
139
141 build_postorder(tree, p);
142
143 fs::path path{fd.path};
144 std::string filename = fd.function_name + path.extension().string();
145
146 p.path = path / filename;
147
148 processed.push_back(p);
149}
150
152 unsigned int thread_count = std::thread::hardware_concurrency();
153 const size_t threads = std::max<size_t>(1, thread_count);
154
155 size_t n = processed.size();
156 size_t chunk = (n + threads - 1) / threads;
157
158 std::vector<std::thread> workers;
159 std::vector<std::vector<DuplicationEntry>> results(threads);
160
161 for (size_t t = 0; t < threads; t++) {
162 size_t begin = t * chunk;
163 size_t end = std::min(begin + chunk, n);
164
165 if (begin >= end)
166 break;
167
168 workers.emplace_back([&, t, begin, end]() {
169 results[t] = compare_range(processed, begin, end);
170 });
171 }
172
173 for (auto& w : workers)
174 w.join();
175
176 std::vector<DuplicationEntry> duplications;
177
178 for (auto& r : results) {
179 duplications.insert(duplications.end(), r.begin(), r.end());
180 }
181
182 save_duplications(duplications);
183}
ZSNode from_tsnode(TSNode node)
int build_postorder(const ZSNode &node, PostOrderTree &tree)
ASTMethod(const fs::path &base_path_, double similarity_)
Constructs preprocessor with configuration.
void on_function(const FunctionData &fd) override
void execute() override
Executes the preprocessing pipeline.
std::string function_name
Name of the function.
std::string path
std::shared_ptr< T > get_feature() const
Configuration management interface.
fs::path path
std::vector< int > lmd
std::vector< std::string > labels
std::vector< ZSNode > children
std::string type
Defines utility functions used across all files.