The program must find duplicate files (by content) in a given directory (and its subdirectories).
I collect all the data in a Map<Long, ArrayList<String>>, where the key is a file size and the value is the list of paths to the files of that size.
/*
 * Entry point: runs a duplicate search over "/" and prints the elapsed
 * wall-clock time in milliseconds.
 */
public static void main(String[] args) {
    final long startedAt = System.currentTimeMillis();
    final FileScanner rootScanner = new FileScanner("/");
    rootScanner.searchFiles();
    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println(elapsedMillis);
}
I tested the program on the root directory (Linux) /, where the total number of files is 281091. The scan took 3131064 milliseconds. In my opinion, it could be much faster.
/boot/grub/biosdisk.mod
/usr/lib/grub/i386-pc/biosdisk.mod
/usr/lib/ruby/vendor_ruby/1.8/rubygems/command_manager.rb
/usr/lib/ruby/1.9.1/rubygems/command_manager.rb
/usr/share/pixmaps/openjdk-7.xpm
/usr/share/app-install/icons/_usr_share_icons_sun-java5.xpm
...
The program also writes a log/files.log file, to which it outputs the paths of files with the same contents, in groups separated by one blank line.
import org.apache.log4j.Logger;
import java.io.*;
import java.util.*;
import java.nio.file.Files;
/**
 * Finds files with identical content below a starting directory.
 *
 * <p>Files are first bucketed by size; only files sharing a size are then
 * compared byte by byte. Each group of identical files is written to the log,
 * one path per line, with a blank line terminating the group.
 */
public class FileScanner {
    private static final int BUFFER_SIZE_SMALL = 1024; // 1 KiB
    private static final int BUFFER_SIZE_MEDIUM = 1048576; // 1 MiB
    private static final int BUFFER_SIZE_BIG = 10485760; // 10 MiB
    private static final Logger log = Logger.getLogger(FileScanner.class);

    private String path;

    /*
     * Key: file size in bytes. Value: canonical paths of the files having
     * that size.
     */
    private final Map<Long, List<String>> mapFiles;

    /*
     * Canonical paths already registered during the current scan, so a file
     * reached more than once (e.g. through a symbolic link) is listed once.
     */
    private final Set<String> seenPaths = new HashSet<>();

    /**
     * Creates a scanner rooted at the given path.
     *
     * @param path directory to scan
     */
    public FileScanner(String path) {
        this.path = path;
        mapFiles = new HashMap<>();
    }

    /**
     * Creates a scanner rooted at the given path with a pre-sized size map.
     *
     * @param path     directory to scan
     * @param capacity initial capacity of the size-to-paths map
     */
    public FileScanner(String path, int capacity) {
        this.path = path;
        mapFiles = new HashMap<>(capacity);
    }

    /** Returns the directory the scan starts from. */
    String getPath() {
        return path;
    }

    /** Sets the directory the scan starts from. */
    void setPath(String path) {
        this.path = path;
    }

    /**
     * Resolves the canonical path of a file.
     *
     * <p>If resolution fails the error is logged and the absolute path is
     * returned instead, so a failure can never leak the path of a previously
     * processed file.
     */
    private String toCanonicalPath(File file) {
        try {
            return file.getCanonicalPath();
        } catch (IOException e) {
            log.warn("Cannot resolve canonical path of " + file, e);
            return file.getAbsolutePath();
        }
    }

    /**
     * Opens a buffered input stream on the given file.
     *
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    protected InputStream getInputStream(File file) throws FileNotFoundException {
        return new BufferedInputStream(new FileInputStream(file));
    }

    /**
     * Chooses the comparison buffer size for a file of the given length.
     *
     * @param length file length in bytes
     * @return buffer size in bytes
     */
    protected int defineBufferLength(long length) {
        if (length < BUFFER_SIZE_MEDIUM) { // under 1 MiB
            return BUFFER_SIZE_SMALL; // 1 KiB
        }
        if (length < BUFFER_SIZE_BIG) { // under 10 MiB
            return BUFFER_SIZE_SMALL * 10; // 10 KiB
        }
        if (length < BUFFER_SIZE_BIG * 10L) { // under 100 MiB
            return BUFFER_SIZE_MEDIUM; // 1 MiB
        }
        if (length < BUFFER_SIZE_BIG * 100L) { // under 1000 MiB
            return BUFFER_SIZE_BIG; // 10 MiB
        }
        return BUFFER_SIZE_BIG * 10; // 100 MiB
    }

    /**
     * Walks the directory tree below {@code path} and buckets every readable
     * regular file by its size. Unreadable entries and symbolic links to
     * directories are skipped.
     */
    private void scanner(String path) {
        // Explicit stack instead of recursion: no risk of overflowing the call
        // stack on very deep trees.
        final Deque<File> pending = new ArrayDeque<>();
        pending.push(new File(path));
        while (!pending.isEmpty()) {
            final File[] entries = pending.pop().listFiles();
            if (entries == null) {
                // listFiles() returns null when the path is not a directory
                // or an I/O error occurs.
                continue;
            }
            for (File entry : entries) {
                if (!entry.canRead()) {
                    continue;
                }
                if (entry.isFile()) {
                    register(entry);
                } else if (entry.isDirectory() && !Files.isSymbolicLink(entry.toPath())) {
                    pending.push(entry);
                }
            }
        }
    }

    /** Adds a file to the bucket for its size unless it was already registered. */
    private void register(File file) {
        final String canonical = toCanonicalPath(file);
        if (!seenPaths.add(canonical)) {
            return;
        }
        final Long size = file.length();
        List<String> sameSize = mapFiles.get(size);
        if (sameSize == null) {
            sameSize = new ArrayList<>();
            mapFiles.put(size, sameSize);
        }
        sameSize.add(canonical);
    }

    /**
     * Reads from the stream until the buffer is full or the stream ends.
     *
     * @return number of bytes placed in the buffer; 0 means end of stream
     */
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            final int n = in.read(buffer, total, buffer.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Compares the content of two files byte by byte.
     *
     * @param path1 path of the first file
     * @param path2 path of the second file
     * @return {@code true} only if the paths differ and both files were read
     *         completely with identical content; {@code false} if the paths
     *         are equal, the contents differ, or an I/O error occurs
     */
    protected boolean compareFiles(String path1, String path2) {
        if (path1.equals(path2)) {
            return false;
        }
        final File f1 = new File(path1);
        final File f2 = new File(path2);
        final long length = f1.length();
        if (length != f2.length()) {
            return false; // different sizes can never have equal content
        }
        // Guard against a non-positive size from an overriding subclass.
        final int size = Math.max(1, defineBufferLength(length));
        final byte[] bytesF1 = new byte[size];
        final byte[] bytesF2 = new byte[size];
        try (InputStream in1 = getInputStream(f1); InputStream in2 = getInputStream(f2)) {
            while (true) {
                // A single read() may deliver fewer bytes than requested, so
                // fill both buffers completely to keep the streams aligned.
                final int read1 = readFully(in1, bytesF1);
                final int read2 = readFully(in2, bytesF2);
                if (read1 != read2) {
                    return false;
                }
                if (read1 == 0) {
                    return true; // both streams ended without a mismatch
                }
                // Compare only the bytes actually read in this round.
                for (int i = 0; i < read1; i++) {
                    if (bytesF1[i] != bytesF2[i]) {
                        return false;
                    }
                }
            }
        } catch (IOException e) {
            log.error("Error:", e);
            return false; // an unreadable file must not be reported as a duplicate
        }
    }

    /**
     * Scans the configured directory and logs every group of files with
     * identical content. Each path of a group is logged on its own line and
     * the last line of the group is followed by a blank line.
     */
    public void searchFiles() {
        // Start from a clean state so repeated calls do not accumulate entries.
        mapFiles.clear();
        seenPaths.clear();
        scanner(path);
        for (List<String> paths : mapFiles.values()) {
            final int count = paths.size();
            if (count < 2) {
                continue;
            }
            // Mark grouped files instead of removing them from the list:
            // removing while index-looping skips elements and loses duplicates.
            final boolean[] grouped = new boolean[count];
            for (int i = 0; i < count; i++) {
                if (grouped[i]) {
                    continue;
                }
                final String reference = paths.get(i);
                boolean isFound = false;
                for (int j = i + 1; j < count; j++) {
                    if (grouped[j]) {
                        continue;
                    }
                    final String candidate = paths.get(j);
                    if (compareFiles(reference, candidate)) {
                        log.info(candidate);
                        grouped[j] = true;
                        isFound = true;
                    }
                }
                if (isFound) {
                    log.info(reference + "\n");
                }
            }
        }
    }
}
How can I optimize any pieces of the code or the algorithm to make it as fast as possible?