Skip to content

Commit

Permalink
Enforcing sorting by file name before processing.
Browse files Browse the repository at this point in the history
  • Loading branch information
schuemie committed Apr 16, 2015
1 parent 42e823e commit 1ad758a
Showing 1 changed file with 37 additions and 25 deletions.
62 changes: 37 additions & 25 deletions src/org/ohdsi/medline/xmlToDatabase/XMLFileIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
Expand All @@ -35,33 +37,43 @@

/**
* Iterates over all xml.gz files in a specified folder, decompressing and parsing them in a separate thread.
*
* @author Schuemie
*
*
*/
public class XMLFileIterator implements Iterator<Document> {

private Iterator<File> fileIterator;
private DecompressAndParseThread decompressAndParseThread = new DecompressAndParseThread();
private boolean hasNext = true;

private Iterator<File> fileIterator;
private DecompressAndParseThread decompressAndParseThread = new DecompressAndParseThread();
private boolean hasNext = true;
/**
* @param folder Specifies the absolute path to the folder containing the xml files
* @param folder
* Specifies the absolute path to the folder containing the xml files
*/
public XMLFileIterator(String folder){
this(folder,Integer.MAX_VALUE);
public XMLFileIterator(String folder) {
this(folder, Integer.MAX_VALUE);
}

/**
*
* @param folder Specifies the absolute path to the folder containing the xml files
* @param sampleSize Specifies the maximum number of files that is randomly sampled
*/
public XMLFileIterator(String folder, int sampleSize){
/**
*
* @param folder
* Specifies the absolute path to the folder containing the xml files
* @param sampleSize
* Specifies the maximum number of files that is randomly sampled
*/
public XMLFileIterator(String folder, int sampleSize) {
List<File> files = new ArrayList<File>();
for (File file : new File(folder).listFiles())
if (file.getAbsolutePath().endsWith("xml.gz"))
if (file.getAbsolutePath().endsWith("xml.gz"))
files.add(file);
files = RandomUtilities.sampleWithoutReplacement(files, sampleSize);
Collections.sort(files, new Comparator<File>() {
@Override
public int compare(File o1, File o2) {
return o1.getName().compareTo(o2.getName());
}
});
fileIterator = files.iterator();
if (fileIterator.hasNext())
decompressAndParseThread.startProcessing(fileIterator.next());
Expand All @@ -70,12 +82,12 @@ public XMLFileIterator(String folder, int sampleSize){
decompressAndParseThread.terminate();
}
}

/**
 * Returns the current value of this iterator's {@code hasNext} flag.
 *
 * @return the value of the {@code hasNext} field
 */
@Override
public boolean hasNext() {
	return this.hasNext;
}

@Override
public Document next() {
decompressAndParseThread.waitUntilFinished();
Expand All @@ -88,33 +100,33 @@ public Document next() {
}
return document;
}

/**
 * Removal is not supported by this iterator.
 *
 * @throws UnsupportedOperationException
 *             always; this is the exception type the {@link java.util.Iterator#remove()} contract
 *             specifies for iterators that do not support removal. It is a subclass of
 *             {@link RuntimeException}, so callers catching the latter are unaffected.
 */
@Override
public void remove() {
	throw new UnsupportedOperationException("Calling unimplemented method remove in " + this.getClass().getName());
}

private class DecompressAndParseThread extends BatchProcessingThread {

private File file;
private Document document;
private File file;
private Document document;

public void startProcessing(File file){
public void startProcessing(File file) {
this.file = file;
proceed();
}

public Document getDocument(){
public Document getDocument() {
return document;
}

@Override
protected void process() {
System.out.println("Processing " + file.getName());
try {
FileInputStream fileInputStream = new FileInputStream(file);
GZIPInputStream gzipInputStream = new GZIPInputStream(fileInputStream);
DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
document = builder.parse(gzipInputStream);
} catch (FileNotFoundException e) {
e.printStackTrace();
Expand Down

0 comments on commit 1ad758a

Please sign in to comment.