I created this C++ program to recursively search directories for
redundant files larger than 100MB and delete all but the largest of
them.
I've tested it myself as a normal user on a few dummy files, but I am
quite apprehensive about running it as root to clean up my messy file
systems.
Any comments on the program, advice on streamlining it or any bugs
spotted, etc.?
A wishlist for the program is to only delete the smaller files if they
are also older, i.e. the largest file that is preserved must be newer
than all other files. If the smaller files are newer then the user is
warned/prompted.
Here's a general outline of how I'd do the job. I've included code to
interface with the file system in Win_find_files. As it stands, the
file_size_t and Win_find_files classes aren't portable (at all), but
_most_ of the rest should be. (There's also the minor detail of the
"c:/" in main, but that's mostly included for demonstration purposes
anyway.)
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include <windows.h>
// Thin wrapper around a Win32 FILETIME that gives it a strict ordering, so
// timestamps can be compared with '<'.
class file_time_t {
    FILETIME ft;
public:
    // Intentionally not explicit: a raw FILETIME may be passed wherever a
    // file_time_t is expected.
    file_time_t(FILETIME const &t) : ft(t) {}

    // Orders by dwHighDateTime first; dwLowDateTime only breaks ties.
    bool operator<(file_time_t const &rhs) const {
        if (ft.dwHighDateTime != rhs.ft.dwHighDateTime)
            return ft.dwHighDateTime < rhs.ft.dwHighDateTime;
        return ft.dwLowDateTime < rhs.ft.dwLowDateTime;
    }
};
// A file size held as one 64-bit unsigned value.
//
// Uses the standard `unsigned long long` rather than the compiler-specific
// `unsigned __int64`, so the class builds on any conforming compiler.
class file_size_t {
    unsigned long long size_;
public:
    // Builds the size from two halves: `high` supplies bits 32 and up,
    // `low` supplies the bits below that.
    file_size_t(unsigned high, unsigned low)
        : size_((static_cast<unsigned long long>(high) << 32) | low) {}

    // Intentionally not explicit: a plain byte count converts implicitly.
    file_size_t(unsigned low) : size_(low) {}

    // Implicit conversion so sizes can be compared with the built-in
    // relational operators.
    operator unsigned long long() const { return size_; }
};
// Joins a directory path `a` and an entry name `b` with a '/' separator.
//
// A separator is inserted only when `a` does not already end in '/'.
// An empty `a` yields `b` unchanged: indexing the last character of an empty
// string is undefined behaviour, and prefixing '/' would silently turn a
// relative name into a root-relative one.
std::string splice(std::string a, std::string const &b) {
    if (a.empty())
        return b;
    if (a[a.size() - 1] != '/')
        a += '/';
    a += b;
    return a;
}
// Record describing one file found during the scan.
struct file {
    std::string path_;      // directory the file was found in
    std::string name_;      // file name, without the directory part
    file_time_t mod_date_;  // timestamp supplied by the scanner
    file_size_t size_;      // size supplied by the scanner

    file(std::string path, std::string name, file_time_t mod_date,
         file_size_t size)
        : path_(path), name_(name), mod_date_(mod_date), size_(size)
    { }

    // Sort order: name ascending, then modification date ascending, then
    // size descending. Files sharing a name therefore end up adjacent.
    //
    // Declared const so const objects (and algorithms that compare through
    // const references) can use it.
    bool operator<(file const &b) const {
        if (name_ < b.name_)
            return true;
        if (b.name_ < name_)
            return false;
        // the names are equal -- look at dates
        if (mod_date_ < b.mod_date_)
            return true;
        if (b.mod_date_ < mod_date_)
            return false;
        // dates are equal -- look at sizes (larger sorts first)
        return b.size_ < size_;
    }
};
class Win_find_files {
file_size_t min_;
std::vector<file> &output_;
void enumerate(std::string const &dir) const {
WIN32_FIND_DATA data;
HANDLE finder;
std::string name =splice(dir,"*");
finder = FindFirstFile(name.c_str(), &data);
if (finder == INVALID_HANDLE_VALUE)
return;
do {
if (data.cFileName[0] == '.')
continue;
if (data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
enumerate(splice(dir, data.cFileName));
}
else {
file_size_t size(data.nFileSizeHigh, data.nFileSizeLow);
if (size > min_)
output_.push_back(file(dir, data.cFileName,
data.ftLastWriteTime, size));
}
} while (FindNextFile(finder, &data));
FindClose(finder);
}
public:
Win_find_files(file_size_t min_size, std::vector<file> &output)
: min_(min_size), output_(output)
{}
virtual void operator()(std::string const &start) const {
enumerate(start);
}
};
// Returns true when `a` is at least as large as `b` and `b` was modified
// strictly before `a`. Presumably the condition under which `b` could be
// removed in favour of `a`; it is not called in the visible code.
bool del_file(file const &a, file const &b) {
    if (a.size_ < b.size_)
        return false;  // a is smaller than b
    return b.mod_date_ < a.mod_date_;
}
// Tells the user that `b` may be a duplicate of `a` and asks whether `b`
// should be deleted.
//
// Returns true only when the user answers 'y' or 'Y'. If no answer can be
// read (end of input or a stream error) the answer is treated as "no", so a
// closed or redirected stdin can never approve a deletion.
bool warn(file const &a, file const &b) {
    std::cout << "Possible duplicate:\n"
              << splice(b.path_, b.name_)
              << "\nmay be a duplicate of: \n"
              << splice(a.path_, a.name_);
    std::cout << "\ndo you want to delete it?";
    char ch = 'n';
    if (!(std::cin >> ch))
        return false;  // failed read: never fall through with a stale value
    return ch == 'y' || ch == 'Y';
}
int main() {
std::vector<file> files;
typedef std::vector<file> collection;
Win_find_files find(100*1024*1024, files);
find("c:/");
std::sort(files.begin(), files.end());
collection::iterator first = files.begin();
collection::iterator next = first+1;
while (next != files.end()) {
if (first->name_ != next->name_)
first = next;
else if (warn(*first, *next))
remove(splice(next->path_, next->name_).c_str());
++next;
}
}
It doesn't implement your specification precisely though -- instead of
generating a script to remove files, it always runs interactively and
removes files itself when/if you approve its doing so (but never removes
anything without asking).