#include #include #include #include #include #include #include "concurrency.h" #include "dir.h" #include "levdist.h" #include "levenshtein.h" // Private functions: /// Shows how to travese the list removing elements void levdist_print_and_destroy_files(levdist_t* this); /// Shows how to traverse the list without removing elements void levdist_print_files(levdist_t* this); /// Creates an array with all the pending comparisons size_t levdist_prepare_comparisons(levdist_t* this); /// Calculates the distance for each comparison int levdist_calculate_comparisons(levdist_t* this); /// Calculates the Levenshtein distance and stores it into the comparison int levdist_compare(levdist_t* this, comparison_t* comparison); /// Calls the selected Levenshtein implementation size_t levdist_call_implementation(levdist_t* this, const void* contents1, size_t len1, const void* contents2, size_t len2); /// Sort comparisons from most similar to the least similar void levdist_sort_comparisons(levdist_t* this); /// Print each comparison and its Levenshtein distance void levdist_print_comparisons(levdist_t* this); /// Reads the entire contents of the file int levdist_read_contents(levdist_t* this, file_record_t* file_record, FILE* file); /// Reads the entire contents of an Unicode file /// @return The number of unicode chars read size_t levdist_read_contents_unicode(wchar_t* buffer, size_t length, FILE* file); void levdist_init(levdist_t* this) { arguments_init(&this->arguments); this->files = NULL; this->comparisons = NULL; } int levdist_run(levdist_t* this, int argc, char* argv[]) { // Analyze the arguments given by the user this->arguments = arguments_analyze(argc, argv); // If arguments are incorrect, stop if ( this->arguments.error ) return this->arguments.error; // If user asked for help or software version, print it and stop if ( this->arguments.help_asked ) return arguments_print_usage(); if ( this->arguments.version_asked ) return arguments_print_version(); // If user did not provided directories, stop if ( this->arguments.dir_count <= 0 ) return (void)fprintf(stderr, "levdist: error: no directories given\n"), 1; // If user asked for unicode, enable it if ( this->arguments.unicode ) setlocale(LC_ALL, ""); // Arguments seems fine, process the directories return levdist_process_dirs(this, argc, argv); } int levdist_process_dirs(levdist_t* this, int argc, char* argv[]) { // For error reporting int error = 0; // Start counting the time walltime_t total_start, compare_start, compare_finish; walltime_start(&total_start); // Load all files into a list this->files = queue_create(); levdist_list_files_in_args(this, argc, argv); // At least two files are required to make comparisons if ( queue_count(this->files) >= 2 ) { // Prepare the comparisons levdist_prepare_comparisons(this); // Do all comparisons and calculate its duration walltime_start(&compare_start); error = levdist_calculate_comparisons(this); walltime_start(&compare_finish); // If comparisons were success if ( error == 0 ) { // Sort comparisons from most similar to the least and print them levdist_sort_comparisons(this); levdist_print_comparisons(this); } } else { fprintf(stderr, "levdist: error: at least two files are required to compare\n"); error = 12; } // Release the memory used levdist_destroy(this); // Report elapsed time, if allowed if ( this->arguments.quiet == false && this->arguments.silent == false ) printf("Total time %.9lfs, comparing time %.9lfs, %zu workers\n" , walltime_elapsed_from(&total_start) , walltime_elapsed_between(&compare_start, &compare_finish) , this->arguments.workers); return error; } int levdist_list_files_in_args(levdist_t* this, int argc, char* argv[]) { // Traverse all arguments for ( int current = 1; current < argc; ++current ) { // Skip command-line options const char* arg = argv[current]; if ( *arg == '-' ) continue; // If argument is a regular file, add it to the queue // otherwise, consider it a directory and scan it if ( dir_is_file(arg) ) { int error = levdist_load_file(this, arg, true); if ( error ) return error; } else dir_list_files_in_dir(this, arg, this->arguments.recursive); } return 0; } int levdist_load_file(levdist_t* this, const char* filename, bool load_contents) { // To report errors int error = 0; // If file is already enqueued, do not compare it against itself if ( levdist_is_in_queue(this, filename) ) return 0; // Try opening the file FILE* file = fopen(filename, "rb"); if ( file == NULL ) return (void)fprintf(stderr, "levdist: error: could not open %s\n", filename), 11; // The file was open, create a record for it and init it file_record_t* file_record = (file_record_t*) calloc( 1, sizeof(file_record_t) ); file_record->filename = strdup(filename); // Get its size using the file system information file_record->size = dir_file_size(filename); // Load file contents, if asked if ( load_contents && (error = levdist_read_contents(this, file_record, file)) ) return (void)fclose(file), error; // Success, add the file record to the queue queue_append( this->files, file_record ); fclose(file); return error; } bool levdist_is_in_queue(levdist_t* this, const char* filename) { for ( queue_iterator_t itr = queue_begin(this->files); itr != queue_end(this->files); itr = queue_next(itr) ) if ( strcmp(((file_record_t*)queue_data(itr))->filename, filename) == 0 ) return true; return false; } int levdist_read_contents(levdist_t* this, file_record_t* file_record, FILE* file) { // If file has no contents, it is an empty file, stop if ( file_record->size <= 0 ) return 0; // The len is one char more than the size of the file (for storing '\0') size_t capacity = (size_t)file_record->size + 1; size_t char_size = this->arguments.unicode ? sizeof(wchar_t) : sizeof(char); // Allocate dynamic memory for contents, this is risky file_record->contents = malloc( capacity * char_size ); if ( file_record->contents == NULL ) return (void)fprintf(stderr, "levdist: error: not enough memory to load %s\n", file_record->filename), 21; // We have memory to load the file. For unicode we read char-by-char if ( this->arguments.unicode ) { // If we can read the entire file, return success size_t actual_size = levdist_read_contents_unicode( (wchar_t*)file_record->contents, capacity - 1, file); if ( actual_size != (size_t)-1 ) return (void)(file_record->size = (off_t)actual_size), 0; } // Read the entire fire in ASCII mode, if it is read return success else if ( fread( file_record->contents, (size_t)file_record->size, 1, file ) == 1 ) return 0; // The read failed, report it and free the memory free( file_record->contents ); file_record->contents = NULL; fprintf(stderr, "levdist: error: could not read %s\n", file_record->filename); return 22; } size_t levdist_read_contents_unicode(wchar_t* buffer, size_t length, FILE* file) { // Read unicode chars and store them in buffer, until find EOF or overflow the buffer size_t index = 0; wint_t ch = 0; while ( index <= length && (ch = fgetwc(file)) != WEOF ) buffer[index++] = ch; // Always set the end-of-string marker buffer[index - (index > length)] = L'\0'; // Success if we read the entire file, that is, including the EOF assert(ch == WEOF); return ch == WEOF ? index : (size_t)-1; } void levdist_print_and_destroy_files(levdist_t* this) { long count = 0; while ( ! queue_is_empty(this->files) ) { char* filename = (char*)queue_pop(this->files); if ( this->arguments.silent == false ) fprintf(stdout, "%ld: %s\n", ++count, filename); free(filename); } } void levdist_print_files(levdist_t* this) { long count = 0; for ( queue_iterator_t itr = queue_begin(this->files); itr != queue_end(this->files); itr = queue_next(itr) ) { const char* filename = (const char*)queue_data(itr); if ( this->arguments.silent == false ) fprintf(stdout, "%ld: %s\n", ++count, filename); } } // todo: Make public size_t levdist_count_comparisons(levdist_t* this) { // The number of files must be greater than zero size_t file_count = queue_count(this->files); if ( file_count == 0 ) return 0; // Calculate the number of comparison using Gauss return file_count * (file_count - 1) / 2; } size_t levdist_prepare_comparisons(levdist_t* this) { // Get the number of comparison using Gauss size_t comp_count = levdist_count_comparisons(this); // Allocate memory for all comparisons this->comparisons = (comparison_t*) malloc(comp_count * sizeof(comparison_t)); assert(this->comparisons); if ( this->comparisons == NULL ) return 0; // Fill the array traversing all filenames in the queue size_t count = 0; for ( queue_iterator_t itr1 = queue_begin(this->files); itr1 != queue_end(this->files); itr1 = queue_next(itr1) ) { // Match this filename with the remaining ones for ( queue_iterator_t itr2 = queue_next(itr1); itr2 != queue_end(this->files); itr2 = queue_next(itr2) ) { // Get the filenames file_record_t* file1 = (file_record_t*)queue_data(itr1); file_record_t* file2 = (file_record_t*)queue_data(itr2); // Copy the filenames in alpabetical order bool ordered = strcmp(file1->filename, file2->filename) < 0; this->comparisons[count].file1 = ordered ? file1 : file2; this->comparisons[count].file2 = ordered ? file2 : file1; // The distance has not been calculated yet this->comparisons[count].distance = (size_t)-1; // Move to the next comparison ++count; } } assert(count == comp_count); return comp_count; } int levdist_calculate_comparisons(levdist_t* this) { // For error reporting int error = 0; // Get the number of comparison using Gauss size_t comp_count = levdist_count_comparisons(this); // Traverse all comparisons and calculate their distance for ( size_t index = 0; index < comp_count; ++index ) if ( (error = levdist_compare( this, &this->comparisons[index] ) ) ) return error; // Done return error; } int levdist_compare(levdist_t* this, comparison_t* comparison) { // Get the names of the files being compared const char* filename1 = comparison->file1->filename; const char* filename2 = comparison->file2->filename; // Get the contents of the files being compared const void* contents1 = comparison->file1->contents; const void* contents2 = comparison->file2->contents; // Get the sizes of the files being compared size_t len1 = (size_t)comparison->file1->size; size_t len2 = (size_t)comparison->file2->size; // If files are not empty compare their contents using the Levenshtein distance algorithm. // If at least one file is empty, their distance is the max length of their contents if ( contents1 && contents2 ) comparison->distance = levdist_call_implementation(this, contents1, len1, contents2, len2); else comparison->distance = len1 + len2; // If success, return 0 if ( comparison->distance != (size_t)-1 ) return 0; // Report the fail fprintf(stderr, "levdist: error: could not compare %s and %s: not enough memory\n", filename1, filename2); return 41; } size_t levdist_call_implementation(levdist_t* this, const void* contents1, size_t len1, const void* contents2, size_t len2) { switch ( this->arguments.lev_impl ) { case lev_serial: this->arguments.workers = 1; return levenshtein_distance_serial(contents1, len1, contents2, len2, this->arguments.unicode); case lev_pthreads_byrows: return levenshtein_distance_pthreads_byrows(contents1, len1, contents2, len2, this->arguments.unicode, &this->arguments.workers); case lev_pthreads_guo: return levenshtein_distance_pthreads_guo(contents1, len1, contents2, len2, this->arguments.unicode, &this->arguments.workers); case lev_pthreads_diagonal: return levenshtein_distance_pthreads_diagonal(contents1, len1, contents2, len2, this->arguments.unicode, &this->arguments.workers); } return (size_t) -1; } int compare_comparison(const void* comparison1, const void* comparison2) { // Short alias const comparison_t* comp1 = (const comparison_t*)comparison1; const comparison_t* comp2 = (const comparison_t*)comparison2; // Get -1 if c1 < c2, 0 if c1 == c2, and 1 if c1 > c2 int order = (comp1->distance > comp2->distance) - (comp1->distance < comp2->distance); if ( order ) return order; // The files have the same distance, compare by filename if ( ( order = strcmp(comp1->file1->filename, comp2->file1->filename) ) ) return order; return strcmp(comp1->file2->filename, comp2->file2->filename); } void levdist_sort_comparisons(levdist_t* this) { // Get the number of comparison using Gauss size_t comp_count = levdist_count_comparisons(this); // Sort them using the quick sort algorithm qsort( this->comparisons, comp_count, sizeof(comparison_t), compare_comparison); } void levdist_print_comparisons(levdist_t* this) { // Get the number of comparison using Gauss size_t comp_count = levdist_count_comparisons(this); // Print all comparisons if ( this->arguments.silent == false ) for ( size_t index = 0; index < comp_count; ++index ) fprintf(stdout, "%ld\t%s\t%s\n" , this->comparisons[index].distance , this->comparisons[index].file1->filename , this->comparisons[index].file2->filename); } void levdist_destroy(levdist_t* this) { // Comparisons only have pointers to the files and their distances free(this->comparisons); // Each file record in the queue must be released manually while ( ! queue_is_empty(this->files) ) { // Get the file record file_record_t* file_record = queue_pop(this->files); // The file record allocates a filename and contents in heap free(file_record->filename); free(file_record->contents); // Release the file record itself free(file_record); } // Destroy the queue itself queue_destroy(this->files, true); }