10 base_path = base_path_;
11 similarity = similarity_;
14 std::cerr <<
"SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY DIFF COMMAND";
/**
 * @brief Collects the paths of the regular files found under a folder.
 *
 * Walks @p folder_path with std::filesystem::recursive_directory_iterator, so
 * nested sub-directories are visited as well.
 *
 * @param folder_path Root directory of the traversal.
 * @return Presumably the collected `file_paths` vector; the return statement
 *         is not visible in this listing — TODO confirm.
 */
18std::vector<std::string> DiffMethod::find_files(
const fs::path& folder_path) {
19 std::vector<std::string> file_paths;
20 for (
const auto& dirEntry : std::filesystem::recursive_directory_iterator(folder_path)) {
21 fs::path file_path = dirEntry.path();
// Only entries that still exist and are regular files are recorded;
// directories and other entry kinds fail this test.
22 if (std::filesystem::exists(file_path) && std::filesystem::is_regular_file(file_path)) {
// Paths are stored as plain strings rather than fs::path objects.
23 file_paths.push_back(file_path.string());
/**
 * @brief Decides whether a line counts as empty.
 *
 * The body of the scanning loop is not visible in this listing; it presumably
 * tests each character with is_empty_char() — TODO confirm.
 *
 * @param line Line to inspect (taken by value).
 */
29bool DiffMethod::is_empty_line(std::string line) {
30 size_t line_size = line.size();
// NOTE(review): the scan starts at index 1, so line[0] is never examined by
// this loop. find_similarity() calls this on diff output, where column 0 holds
// a change marker, but remove_blank_lines() calls it on ordinary source lines
// too — confirm that skipping the first character is intended for both callers.
32 for (
size_t i = 1; i < line_size; i++) {
/**
 * @brief Filters a list of lines using is_empty_line().
 *
 * @param content Lines to filter (taken by value).
 * @return Presumably `ret`, holding the lines for which is_empty_line() is
 *         false, in their original order; the append and the return statement
 *         are not visible in this listing — TODO confirm.
 */
38std::vector<std::string> DiffMethod::remove_blank_lines(std::vector<std::string> content) {
39 std::vector<std::string> ret;
// `auto line` makes a copy of every string while iterating.
40 for (
auto line : content) {
41 if (!is_empty_line(line)) {
/**
 * @brief Compares two line lists for equality.
 *
 * The visible code first compares the number of lines and then compares the
 * lines pairwise by index. The return statements are not visible in this
 * listing; presumably a size or line mismatch yields false and a full match
 * yields true — TODO confirm.
 *
 * @param content1 Lines of the first file (taken by value).
 * @param content2 Lines of the second file (taken by value).
 */
48bool DiffMethod::is_equal_files(std::vector<std::string> content1, std::vector<std::string> content2) {
// Different line counts are handled before any line is compared.
49 if (content1.size() != content2.size()) {
52 size_t sz = content1.size();
// Both vectors have `sz` elements at this point, so indexing either is in range.
53 for (
size_t i = 0; i < sz; i++) {
54 if (content1[i] != content2[i]) {
/**
 * @brief Computes a similarity percentage for two files using the external
 *        `diff` command.
 *
 * Visible steps: blank lines are removed from both contents, identical
 * contents take an early branch, otherwise `diff -c` is run with its output
 * redirected to `diff.txt` under `base_path`, and the number of differing
 * lines counted from that output is turned into a percentage.
 *
 * Several lines of this function are not visible in this listing: the
 * declarations of `content1`, `content2` and `content`, and the bodies of the
 * early-exit branches. Comments about them below are hedged.
 *
 * @param path1 Path of the first file (inserted into a shell command).
 * @param path2 Path of the second file (inserted into a shell command).
 * @return equal_lines / (different_lines + equal_lines) * 100 on the main path.
 */
61double DiffMethod::find_similarity(std::string path1, std::string path2) {
// content1 / content2 are presumably the lines read from path1 / path2
// (declarations not visible) — TODO confirm.
65 content1 = remove_blank_lines(content1);
66 content2 = remove_blank_lines(content2);
// Identical contents (after blank-line removal) short-circuit before diff is
// run; the value returned by this branch is not visible here.
68 if (is_equal_files(content1, content2)) {
// The diff output always goes to the same file, so each call overwrites the
// result of the previous one.
73 fs::path output_file = base_path /
"diff.txt";
// NOTE(review): the paths are concatenated into the shell command without any
// quoting or escaping, so a path containing spaces or shell metacharacters
// changes the command that is executed — confirm inputs are trusted.
// "-c" requests context output format; "2>&1" sends diff's stderr to the same file.
77 std::string command =
"diff " + path1 +
" " + path2 +
" -c > " + output_file.string() +
" 2>&1";
// The exit status returned by system() is discarded.
78 system(command.c_str());
// `content` is presumably the diff output read back from output_file
// (declaration not visible) — TODO confirm.
82 size_t number_lines = content.size();
84 int different_lines = 0;
// Scanning starts at index 4; presumably this skips the header lines of the
// context diff — TODO confirm against the diff output format.
86 for (
size_t i = 4; i < number_lines; i++) {
87 auto& line = content[i];
// Body of this branch is not visible; presumably empty lines are skipped.
88 if (is_empty_line(line)) {
// A first character of '!', '-' or '+' marks a changed, removed or added line
// in context format; the branch body (presumably an increment of
// different_lines) is not visible.
// NOTE(review): hunk range lines of the form "--- M,N ----" also begin with
// '-' and would match this test — confirm whether that is accounted for.
91 if (line[0] ==
'!' || line[0] ==
'-' || line[0] ==
'+') {
// Total line count of both (blank-stripped) files minus the differing lines.
96 int equal_lines = (int)content1.size() + (int)content2.size();
97 equal_lines -= different_lines;
// different_lines + equal_lines equals content1.size() + content2.size(), so
// this guards the division below against two empty inputs; the value returned
// by the branch is not visible.
99 if (different_lines + equal_lines == 0) {
// Convert to double before dividing to avoid integer division.
103 double db_equal_lines = equal_lines;
104 double db_different_lines = different_lines;
105 double similarity_metric = db_equal_lines / (db_different_lines + db_equal_lines);
// Scale the ratio to a percentage.
106 similarity_metric *= 100;
107 return similarity_metric;
/**
 * @brief Builds the list of file pairs whose similarity reaches the
 *        configured threshold.
 *
 * Calls find_similarity() on pairs of entries of @p file_paths and keeps those
 * whose result is >= `similarity`.
 *
 * @param file_paths Paths of the files to compare.
 * @return Presumably `ret` after sorting; the return statement is not visible
 *         in this listing — TODO confirm.
 */
110std::vector<DuplicationEntry> DiffMethod::find_similar_pairs(std::vector<std::string>& file_paths) {
111 size_t number_files = file_paths.size();
112 std::vector<DuplicationEntry> ret;
113 for (
size_t i = 0; i < number_files; i++) {
// NOTE(review): j starts at 0, and the line that follows this loop header is
// not visible in this listing. Unless that line filters indices, every file is
// compared with itself and every pair is evaluated in both orders, each
// comparison spawning an external diff process — confirm.
114 for (
size_t j = 0; j < number_files; j++)
116 std::string file_path1 = file_paths[i];
117 std::string file_path2 = file_paths[j];
118 double similarity_metric = find_similarity(file_path1, file_path2);
// Keep the pair only when it reaches the threshold stored in `similarity`.
119 if (similarity_metric >= similarity) {
120 ret.push_back({similarity_metric, file_path1, file_path2});
// Sorting through reverse iterators yields descending order according to
// DuplicationEntry's operator< (defined elsewhere).
124 sort(ret.rbegin(), ret.rend());
/**
 * @brief Writes the duplication pairs to `output_parsed.txt` under `base_path`.
 *
 * Output layout: the first line holds the number of pairs; each following line
 * holds `path1 path2 similarity`, with the similarity printed in fixed
 * notation with two decimals.
 *
 * @param file_duplication_pairs Entries to write, in the order given.
 */
128void DiffMethod::save_duplications(std::vector<DuplicationEntry>& file_duplication_pairs) {
// NOTE(review): this relies on the implicit fs::path -> std::string conversion,
// which exists only where path::string_type is std::string (POSIX) — confirm
// the supported platforms.
129 std::string output_file_path = base_path /
"output_parsed.txt";
// The stream is not checked for a failed open before writing.
131 auto fout = std::ofstream(output_file_path);
133 fout << file_duplication_pairs.size() <<
'\n';
// The binding named `similarity` is local to the loop; it presumably shadows
// the class member of the same name used as the threshold elsewhere.
134 for (
const auto& [similarity, path1, path2] : file_duplication_pairs) {
// NOTE(review): the two paths are separated by a single space, so a path that
// itself contains a space makes the line ambiguous for a reader of this file.
135 fout << path1 <<
' ' << path2 <<
' ';
136 fout << std::fixed << std::setprecision(2) << similarity <<
'\n';
149 fs::path relative(fd.
path);
150 std::string filename = fd.
function_name + relative.extension().string();
151 fs::path path = base / relative / filename;
158 std::vector<std::string> file_paths = find_files(base);
160 std::vector<DuplicationEntry> file_duplication_pairs = find_similar_pairs(file_paths);
164 save_duplications(file_duplication_pairs);
void on_function(const FunctionData &fd) override
void execute() override
Executes the preprocessing pipeline.
DiffMethod(const fs::path &base_path_, double similarity_)
Constructs preprocessor with configuration.
std::string function_name
Name of the function.
std::shared_ptr< T > get_feature() const
Configuration management interface.
Code duplication preprocessing system.
void write_file(const fs::path &path, const std::string &content)
Writes content to a file at specified path.
std::vector< std::string > read_file_with_vector(const fs::path &path)
Reads a file line by line into a vector of strings.
bool is_empty_char(char c)
Checks if a character is considered empty/whitespace.
fs::path source_feature_path
Defines utility functions used across all files.