4 base_path = base_path_;
5 similarity = similarity_;
8 std::cout <<
"SIMILARITY SHOULD BE GREATER OR EQUAL 0 TO USE DUPLICATION FINDER BY DIFF COMMAND";
12vector<string> DuplicationFinderDiff::find_files(
string folder_path){
13 vector<string> file_paths;
14 for(
const auto &dirEntry: std::filesystem::recursive_directory_iterator(folder_path)){
15 string file_path = dirEntry.path().string();
17 file_paths.push_back(file_path);
23bool DuplicationFinderDiff::is_empty_line(
string line){
24 int line_size = line.size();
26 for(
int i = 1; i < line_size; i++){
32vector<string> DuplicationFinderDiff::remove_blank_lines(vector<string> content){
34 for(
auto line : content){
35 if(!is_empty_line(line)){
42bool DuplicationFinderDiff::is_equal_files(vector< string> content1, vector<string> content2){
43 if(content1.size() != content2.size()){
46 int sz = content1.size();
47 for(
int i = 0; i < sz; i++){
48 if(content1[i] != content2[i]){
55double DuplicationFinderDiff::find_similarity(
string path1,
string path2){
59 content1 = remove_blank_lines(content1);
60 content2 = remove_blank_lines(content2);
62 if(is_equal_files(content1,content2)){
67 string output_file = base_path+
"/diff.txt";
71 string command =
"diff " + path1 +
" " + path2 +
" -c > " +output_file +
" 2>&1";
72 system(command.c_str());
76 int number_lines = content.size();
78 int different_lines = 0;
80 for(
int i = 4; i < number_lines; i++){
81 auto& line = content[i];
82 if(is_empty_line(line)){
85 if(line[0] ==
'!' || line[0] ==
'-' || line[0] ==
'+'){
90 int equal_lines = (int)content1.size() + (int)content2.size();
91 equal_lines -= different_lines;
93 if(different_lines+equal_lines == 0){
97 double db_equal_lines = equal_lines;
98 double db_different_lines = different_lines;
99 double similarity_metric = db_equal_lines/(db_different_lines+db_equal_lines);
100 similarity_metric *= 100;
101 return similarity_metric;
104vector<tuple<double,string,string>> DuplicationFinderDiff::find_similar_pairs(vector<string> &file_paths){
105 size_t number_files = file_paths.size();
106 vector<tuple<double,string,string>>
ret;
107 for(
size_t i = 0; i < number_files; i++){
108 for(
size_t j = 0; j < number_files; j++)
if(i != j){
109 string file_path1 = file_paths[i];
110 string file_path2 = file_paths[j];
111 double similarity_metric = find_similarity(file_path1,file_path2);
112 if( similarity_metric >= similarity){
113 ret.push_back({similarity_metric,file_path1,file_path2});
117 sort(
ret.rbegin(),
ret.rend());
121void DuplicationFinderDiff::save_duplications(vector<tuple<double,string,string>> &file_duplication_pairs){
122 string output_file_path = base_path+
"/output_parsed.txt";
124 auto fout = ofstream(output_file_path);
126 fout << file_duplication_pairs.size() <<
'\n';
127 for(
const auto &[similarity,path1,path2] : file_duplication_pairs){
128 fout << path1 <<
' ' << path2 <<
' ';
129 fout << fixed << setprecision(2) << similarity <<
'\n';
137 vector<string> file_paths = find_files(base_path+
"/source");
139 vector<tuple<double,string,string>> file_duplication_pairs = find_similar_pairs(file_paths);
141 cout << SAVING_MESSAGE <<
'\n';
143 save_duplications(file_duplication_pairs);
void execute()
Executes the preprocessing pipeline.
DuplicationFinderDiff(string base_path_, double similarity_)
Constructs the preprocessor with its configuration: the base path and the similarity threshold.
Code duplication preprocessing system.
bool is_regular_file(string path)
Determines if a path refers to a regular file.
bool does_file_exist(string file_path)
Checks if a file exists at the given path.
bool is_empty_char(char c)
Checks if a character is considered empty/whitespace.
vector< string > read_file_generic(string string_path)
Reads a file line by line into a vector of strings.