7 base_path = base_path_;
8 similarity = similarity_;
11 std::cerr <<
"SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY DIFF COMMAND";
15vector<string> DuplicationFinderDiff::find_files(
const fs::path& folder_path) {
16 vector<string> file_paths;
17 for (
const auto& dirEntry : std::filesystem::recursive_directory_iterator(folder_path)) {
18 fs::path file_path = dirEntry.path();
19 if (std::filesystem::exists(file_path) && std::filesystem::is_regular_file(file_path)) {
20 file_paths.push_back(file_path.string());
// Predicate over a single line of text; remove_blank_lines() and
// find_similarity() use it to decide which lines to skip.
// NOTE(review): presumably returns true when the scanned characters are all
// "empty" characters; the loop body is not visible here — confirm.
26bool DuplicationFinderDiff::is_empty_line(
string line) {
27 size_t line_size = line.size();
// NOTE(review): the scan starts at index 1, so the first character of the
// line is never examined by this loop. Confirm this is intended for callers
// that pass plain file lines rather than diff output.
29 for (
size_t i = 1; i < line_size; i++) {
35vector<string> DuplicationFinderDiff::remove_blank_lines(vector<string> content) {
37 for (
auto line : content) {
38 if (!is_empty_line(line)) {
45bool DuplicationFinderDiff::is_equal_files(vector<string> content1, vector<string> content2) {
46 if (content1.size() != content2.size()) {
49 size_t sz = content1.size();
50 for (
size_t i = 0; i < sz; i++) {
51 if (content1[i] != content2[i]) {
// Computes a similarity percentage between the files at path1 and path2,
// based on the output of the external `diff` command.
58double DuplicationFinderDiff::find_similarity(
string path1,
string path2) {
// Blank lines are dropped from both contents before any comparison.
62 content1 = remove_blank_lines(content1);
63 content2 = remove_blank_lines(content2);
// Shortcut for files that are identical once blank lines are removed.
65 if (is_equal_files(content1, content2)) {
// diff output is written to a fixed file under base_path.
// NOTE(review): the name is fixed, so every call overwrites the same file.
70 fs::path output_file = base_path /
"diff.txt";
// Builds `diff <path1> <path2> -c`, with stdout and stderr redirected into
// output_file.
// NOTE(review): the paths are concatenated unquoted, so a path containing
// spaces or shell metacharacters changes the command that is run.
74 string command =
"diff " + path1 +
" " + path2 +
" -c > " + output_file.string() +
" 2>&1";
// NOTE(review): the return value of system() is discarded here.
75 system(command.c_str());
79 size_t number_lines = content.size();
81 int different_lines = 0;
// Scans from index 4 onward.
// NOTE(review): presumably this skips the header lines of the context diff — confirm.
83 for (
size_t i = 4; i < number_lines; i++) {
84 auto& line = content[i];
85 if (is_empty_line(line)) {
// Lines whose first character is '!', '-' or '+' are singled out.
// NOTE(review): presumably these are counted into different_lines; that
// statement is not visible here.
88 if (line[0] ==
'!' || line[0] ==
'-' || line[0] ==
'+') {
// Equal lines = total non-blank lines of both files minus the differing ones.
93 int equal_lines = (int)content1.size() + (int)content2.size();
94 equal_lines -= different_lines;
// Guards the division below against a zero denominator.
96 if (different_lines + equal_lines == 0) {
// Result is equal / (equal + different), scaled to a percentage.
100 double db_equal_lines = equal_lines;
101 double db_different_lines = different_lines;
102 double similarity_metric = db_equal_lines / (db_different_lines + db_equal_lines);
103 similarity_metric *= 100;
104 return similarity_metric;
// Compares the given files against each other with find_similarity() and
// collects (similarity, path1, path2) tuples for the pairs whose metric is at
// least the member threshold `similarity`.
107vector<tuple<double, string, string>> DuplicationFinderDiff::find_similar_pairs(vector<string>& file_paths) {
108 size_t number_files = file_paths.size();
109 vector<tuple<double, string, string>> ret;
// Both indices run over the full range [0, number_files).
// NOTE(review): as visible here this includes i == j and both orderings of
// each pair, unless a statement not shown filters them — confirm.
110 for (
size_t i = 0; i < number_files; i++) {
111 for (
size_t j = 0; j < number_files; j++)
113 string file_path1 = file_paths[i];
114 string file_path2 = file_paths[j];
115 double similarity_metric = find_similarity(file_path1, file_path2);
116 if (similarity_metric >= similarity) {
117 ret.push_back({similarity_metric, file_path1, file_path2});
// Sorting through reverse iterators leaves `ret` in descending tuple order
// (highest similarity first).
121 sort(ret.rbegin(), ret.rend());
125void DuplicationFinderDiff::save_duplications(vector<tuple<double, string, string>>& file_duplication_pairs) {
126 string output_file_path = base_path /
"output_parsed.txt";
128 auto fout = ofstream(output_file_path);
130 fout << file_duplication_pairs.size() <<
'\n';
131 for (
const auto& [similarity, path1, path2] : file_duplication_pairs) {
132 fout << path1 <<
' ' << path2 <<
' ';
133 fout << fixed << setprecision(2) << similarity <<
'\n';
// Gather the files found under <base_path>/source.
140 vector<string> file_paths = find_files(base_path /
"source");
// Compare the files with each other; keep the pairs find_similar_pairs() returns.
142 vector<tuple<double, string, string>> file_duplication_pairs = find_similar_pairs(file_paths);
// Hand the resulting pairs to save_duplications().
146 save_duplications(file_duplication_pairs);
void execute()
Executes the preprocessing pipeline.
DuplicationFinderDiff(const fs::path &base_path_, double similarity_)
Constructs preprocessor with configuration.
Code duplication preprocessing system.
bool is_empty_char(char c)
Checks if a character is considered empty/whitespace.
std::vector< std::string > read_file_generic(const fs::path &string_path)
Reads a file line by line into a vector of strings.