19 out.
type = ts_node_type(node);
21 uint32_t count = ts_node_named_child_count(node);
23 for (uint32_t i = 0; i < count; i++) {
33 for (
const auto& child : node.
children) {
41 int id = tree.
labels.size();
49 tree.
lmd.push_back(leftmost);
55 size_t n = a.
labels.size();
56 size_t m = b.
labels.size();
58 std::vector<std::vector<int>> dp(n + 1, std::vector<int>(m + 1));
60 for (
size_t i = 0; i <= n; i++) {
64 for (
size_t j = 0; j <= m; j++) {
68 for (
size_t i = 1; i <= n; i++) {
69 for (
size_t j = 1; j <= m; j++) {
71 int rename_cost = a.
labels[i - 1] == b.
labels[j - 1] ? 0 : 1;
76 dp[i - 1][j - 1] + rename_cost
85 int dist = tree_distance(a, b);
87 int max_size = std::max(a.
labels.size(), b.
labels.size());
89 return 1.0 - (
static_cast<double>(dist) / max_size);
93 base_path = base_path_;
94 similarity = similarity_;
97 std::cerr <<
"SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY AST COMMAND";
101void ASTMethod::save_duplications(std::vector<DuplicationEntry>& file_duplication_pairs) {
102 std::string output_file_path = base_path /
"output_parsed.txt";
104 auto fout = std::ofstream(output_file_path);
106 fout << file_duplication_pairs.size() <<
'\n';
107 for (
const auto& [similarity, path1, path2] : file_duplication_pairs) {
108 fout << path1 <<
' ' << path2 <<
' ';
109 fout << std::fixed << std::setprecision(2) << similarity <<
'\n';
/// Collects duplication entries for the trees whose indices lie in
/// [begin, end): each such tree is scored against every later tree in
/// `processed` (the inner index runs from i + 1 to processed.size(), not
/// just up to `end`), so each unordered pair is visited once.
115std::vector<DuplicationEntry> ASTMethod::compare_range(
116 const std::vector<PostOrderTree>& processed,
117 size_t begin,
size_t end
119 std::vector<DuplicationEntry> local;
121 for (
size_t i = begin; i < end; i++) {
122 for (
size_t j = i + 1; j < processed.size(); j++) {
// The score returned by similarity_score is scaled by 100 before the threshold test.
123 double sim = similarity_score(processed[i], processed[j]) * 100.0;
// Record the pair only when the scaled score reaches the `similarity` threshold.
125 if (sim >= similarity)
126 local.push_back({sim, processed[i].path, processed[j].path});
143 fs::path path{fd.
path};
144 std::string filename = fd.
function_name + path.extension().string();
146 p.
path = path / filename;
148 processed.push_back(p);
152 unsigned int thread_count = std::thread::hardware_concurrency();
153 const size_t threads = std::max<size_t>(1, thread_count);
155 size_t n = processed.size();
156 size_t chunk = (n + threads - 1) / threads;
158 std::vector<std::thread> workers;
159 std::vector<std::vector<DuplicationEntry>> results(threads);
161 for (
size_t t = 0; t < threads; t++) {
162 size_t begin = t * chunk;
163 size_t end = std::min(begin + chunk, n);
168 workers.emplace_back([&, t, begin, end]() {
169 results[t] = compare_range(processed, begin, end);
173 for (
auto& w : workers)
176 std::vector<DuplicationEntry> duplications;
178 for (
auto& r : results) {
179 duplications.insert(duplications.end(), r.begin(), r.end());
182 save_duplications(duplications);
ZSNode from_tsnode(TSNode node)
int build_postorder(const ZSNode &node, PostOrderTree &tree)
ASTMethod(const fs::path &base_path_, double similarity_)
Constructs preprocessor with configuration.
void on_function(const FunctionData &fd) override
void execute() override
Executes the preprocessing pipeline.
std::string function_name
Name of the function.
std::shared_ptr< T > get_feature() const
Configuration management interface.
std::vector< std::string > labels
std::vector< ZSNode > children
Defines utility functions used across all files.