Skip to content

Commit

Permalink
Use ArrayList instead of LinkedList
Browse files Browse the repository at this point in the history
The paragraph neighbor processing is based entirely on List.get(),
making a LinkedList a very inefficient choice.
  • Loading branch information
tfmorris committed Apr 15, 2016
1 parent cc6291c commit b377e49
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
Expand Down Expand Up @@ -113,7 +113,7 @@ private Document convertHtmlToDoc(String html)
* Initialize the Paragraph explorer class in order to convert a document to
* a list of blocks (paragraphs)
*/
private LinkedList<Paragraph> makeParagraphs(Node node)
private ArrayList<Paragraph> makeParagraphs(Node node)
{
ParagraphsExplorer pe = new ParagraphsExplorer();
node.traverse(pe); //begin the traversal of the doc
Expand Down Expand Up @@ -274,12 +274,11 @@ private PARAGRAPH_TYPE getNextNeighbourOptimized(int i, List<Paragraph> paragrap
* <li>postprocessing of header blocks
* </ol>
*
* FIXME: This can behave pathologically in the presence of large lists of "paragraphs"
* with no textual content. In this case the maxHeadingDistance parameter isn't adequate
to short-circuit large amounts of processing. We may need to cap the number of elements
to search (10? 20?).
* NOTE: Normally we'd use List<Paragraph> in the definition, but this method makes extensive
* use of List.get() which is very inefficient with other list implementations such as LinkedList.
* This could be rewritten to use ListIterators to generalize it, but I don't see the point.
*/
private void reclassifyContextSensitive(List<Paragraph> paragraphs, int maxHeadingDistance)
private void reclassifyContextSensitive(ArrayList<Paragraph> paragraphs, int maxHeadingDistance)
{
// Default classification is the same as the context-free classification
for (Paragraph p : paragraphs) {
Expand Down Expand Up @@ -408,7 +407,7 @@ private List<Paragraph> classify(String htmlText, Set<String> stopwordsSet, int
}

Document jSoupDoc = convertHtmlToDoc(htmlText);
LinkedList<Paragraph> paragraphs = makeParagraphs(jSoupDoc);
ArrayList<Paragraph> paragraphs = makeParagraphs(jSoupDoc);
//context-free classification
classifyContextFree(paragraphs, stopwordsSet, lengthLow, lengthHigh,
stopwordsLow, stopwordsHigh, maxLinkDensity);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Pattern;

Expand All @@ -48,7 +48,7 @@ public class ParagraphsExplorer
new HashSet<String>(Arrays.asList(new String[] { "blockquote", "caption", "center", "col", "colgroup", "dd",
"div", "dl", "dt", "fieldset", "form", "legend", "optgroup", "option", "p", "pre", "table", "td",
"textarea", "tfoot", "th", "thead", "tr", "ul", "li", "h1", "h2", "h3", "h4", "h5", "h6" })));
private final LinkedList<Paragraph> paragraphs;
private final ArrayList<Paragraph> paragraphs;
private Paragraph currentParagraph = null;
private boolean lastBR = false;
private boolean inHeading = false;
Expand All @@ -62,7 +62,7 @@ public enum AncestorState

public ParagraphsExplorer()
{
this.paragraphs = new LinkedList<>();
this.paragraphs = new ArrayList<>();
}

@Override
Expand Down Expand Up @@ -106,7 +106,7 @@ public void tail(Node node, int depth)
*
* @return paragraphs
*/
public LinkedList<Paragraph> getParagraphs()
public ArrayList<Paragraph> getParagraphs()
{
return paragraphs;
}
Expand Down

0 comments on commit b377e49

Please sign in to comment.