diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java index ef4117e93c..e237c09e83 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Figure.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Figure.java @@ -88,6 +88,9 @@ public boolean apply(GraphicObject graphicObject) { private List textArea; private List layoutTokens; + // Contains the raw layoutTokens from the fulltext model + private List rawLayoutTokens = new ArrayList<>(); + // coordinates private int page = -1; private double y = 0.0; @@ -323,8 +326,13 @@ public String getTeiId() { return "fig_" + this.id; } + /** Returns true when the figure has a header, a caption or at least one graphic object, i.e. something to serialize; toTEI() returns null otherwise. */ + public boolean isCompleteForTEI() { + return (StringUtils.isNotEmpty(header) || StringUtils.isNotEmpty(caption) || CollectionUtils.isNotEmpty(graphicObjects)); + } + public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption) && CollectionUtils.isEmpty(graphicObjects)) { + if (!isCompleteForTEI()) { return null; } Element figureElement = XmlBuilderUtils.teiElement("figure"); @@ -568,4 +576,12 @@ public void setLabel(StringBuilder label) { public void setUri(URI uri) { this.uri = uri; } + + public List getRawLayoutTokens() { + return rawLayoutTokens; + } + + public void setRawLayoutTokens(List rawLayoutTokens) { + this.rawLayoutTokens = rawLayoutTokens; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java index 0764796b38..1016760284 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Table.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Table.java @@ -30,7 +30,6 @@ import nu.xom.Attribute; import nu.xom.Element; import nu.xom.Node; -import nu.xom.Text; import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId; @@ 
-44,8 +43,6 @@ public class Table extends Figure { private List contentTokens = new ArrayList<>(); private List fullDescriptionTokens = new ArrayList<>(); - // Contains the raw layoutTokens from the fulltext model - private List rawLayoutTokens = new ArrayList<>(); private boolean goodTable = true; private StringBuilder note = null; @@ -65,9 +62,13 @@ public Table() { note = new StringBuilder(); } + public boolean isCompleteForTEI() { + return (StringUtils.isNotEmpty(header) && StringUtils.isNotEmpty(caption)); + } + @Override public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List markerTypes) { - if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption)) { + if (!isCompleteForTEI()) { return null; } @@ -107,7 +108,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(desc, "_" + divID); } - if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) { + if (StringUtils.isNotBlank(labeledCaption)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -172,7 +173,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form } Element noteNode = null; - if (note != null && note.toString().trim().length()>0) { + if (StringUtils.isNotBlank(note)) { noteNode = XmlBuilderUtils.teiElement("note"); if (config.isGenerateTeiIds()) { @@ -180,7 +181,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form addXmlId(noteNode, "_" + divID); } - if ( (labeledNote != null) && (labeledNote.length() > 0) ) { + if (StringUtils.isNotBlank(labeledNote)) { TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens); List clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { @@ -349,9 +350,14 @@ public String 
getLabeledNote() { return this.labeledNote; } - private boolean validateTable() { + /** Check if the table: + * - has label, header and content + * - header starts with "tab" + * - label can be parsed + */ + public boolean validateTable() { CntManager cnt = Engine.getCntManager(); - if (StringUtils.isEmpty(label) || StringUtils.isEmpty(header) || StringUtils.isEmpty(content)) { + if (StringUtils.isAnyBlank(label, header, content)) { cnt.i(TableRejectionCounters.EMPTY_LABEL_OR_HEADER_OR_CONTENT); return false; } @@ -362,7 +368,8 @@ private boolean validateTable() { cnt.i(TableRejectionCounters.CANNOT_PARSE_LABEL_TO_INT); return false; } - if (!getHeader().toLowerCase().startsWith("table")) { + // tab covers: table, tabelle, tableau, tabella, etc. + if (!StringUtils.startsWithIgnoreCase(getHeader(), "tab")) { cnt.i(TableRejectionCounters.HEADER_NOT_STARTS_WITH_TABLE_WORD); return false; } @@ -427,11 +434,4 @@ public String getTeiId() { return "tab_" + this.id; } - public List getRawLayoutTokens() { - return rawLayoutTokens; - } - - public void setRawLayoutTokens(List rawLayoutTokens) { - this.rawLayoutTokens = rawLayoutTokens; - } } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 36e1065313..23b39aee82 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -37,12 +37,7 @@ import org.grobid.core.layout.*; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.LanguageUtilities; -import org.grobid.core.utilities.TextUtilities; -import org.grobid.core.utilities.KeyGen; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.Consolidation; +import 
org.grobid.core.utilities.*; import org.grobid.core.utilities.matching.ReferenceMarkerMatcher; import org.grobid.core.utilities.matching.EntityMatcherException; import org.grobid.core.engines.citations.CalloutAnalyzer; @@ -72,6 +67,7 @@ import nu.xom.Element; import static org.apache.commons.lang3.StringUtils.*; +import static org.grobid.core.utilities.LabelUtils.postProcessFullTextLabeledText; public class FullTextParser extends AbstractParser { private static final Logger LOGGER = LoggerFactory.getLogger(FullTextParser.class); @@ -260,7 +256,7 @@ else if (config.getConsolidateCitations() == 2) List tables = null; List equations = null; if (featSeg != null && isNotBlank(featSeg.getLeft())) { - // if featSeg is null, it usually means that no body segment is found in the + // if featSeg is null, it usually means that the fulltext body is not found in the // document segmentation String bodytext = featSeg.getLeft(); layoutTokenization = featSeg.getRight(); @@ -269,7 +265,7 @@ else if (config.getConsolidateCitations() == 2) resultBody = label(bodytext); //Correct subsequent I-
or I-
- resultBody = adjustInvalidSequenceOfStartLabels(resultBody); + resultBody = LabelUtils.adjustInvalidSequenceOfStartLabels(resultBody); // we apply now the figure and table models based on the fulltext labeled output figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc); @@ -282,6 +278,17 @@ else if (config.getConsolidateCitations() == 2) } } + List
badFigures = figures.stream() + .filter(f -> !f.isCompleteForTEI()) + .collect(Collectors.toList()); + + LOGGER.warn("Identified bad figures: " + badFigures.size()); + resultBody = revertResultsForBadItems(badFigures, resultBody, TaggingLabels.FIGURE_LABEL); + + figures = figures.stream() + .filter(f -> !badFigures.contains(f)) + .collect(Collectors.toList()); + tables = processTables(resultBody, layoutTokenization.getTokenization(), doc); //We deal with tables considered bad by reverting them as , to reduce the risk them to be @@ -290,117 +297,14 @@ else if (config.getConsolidateCitations() == 2) //TODO: double check the way the tables are validated List
badTables = tables.stream() - .filter(t -> !t.isGoodTable()) + .filter(t -> !(t.isCompleteForTEI() && t.validateTable())) .collect(Collectors.toList()); - //LF: we update the resultBody sequence by reverting these tables as elements - if (CollectionUtils.isNotEmpty(badTables)) { - List> splitResult = Arrays.stream(resultBody.split("\n")) - .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) - .collect(Collectors.toList()); - - for (Table badTable : badTables) { - // Find the index of the first layoutToken of the table in the tokenization - List rawLayoutTokenTable = badTable.getRawLayoutTokens(); - LayoutToken firstLayoutTokenTable = rawLayoutTokenTable.get(0); - - final List documentTokenization = layoutTokenization.getTokenization(); - - int tokenIndex = IntStream.range(0, documentTokenization.size()) - .filter(i -> { - LayoutToken l = documentTokenization.get(i); - return l.getText().equals(firstLayoutTokenTable.getText()) - && l.getPage() == firstLayoutTokenTable.getPage() - && l.getOffset() == firstLayoutTokenTable.getOffset(); - }) - .findFirst() - .orElse(-1); - - System.out.println(tokenIndex); - - List candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) - && Iterables.getLast(splitResult.get(i)).equals("I-
")) - .boxed() - .collect(Collectors.toList()); - - if (candidateIndexes.isEmpty()) { - candidateIndexes = IntStream.range(0, splitResult.size()) - .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) - && Iterables.getLast(splitResult.get(i)).equals("
")) - .boxed() - .collect(Collectors.toList()); - if (candidateIndexes.isEmpty()) { - LOGGER.info("Cannot find the candidate index for fixing the tables."); - continue; - } - } - - // Need to match with the rest - List tokensNoSpace = rawLayoutTokenTable.stream() - .map(LayoutToken::getText) - .map(StringUtils::strip) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toList()); - - int resultIndexCandidate = -1; - if (tokensNoSpace.size() == 1){ - resultIndexCandidate = candidateIndexes.get(0); - } else { - for (int candidateIndex: candidateIndexes) { - List candidateTable = splitResult.subList(candidateIndex, candidateIndex + tokensNoSpace.size()) - .stream() - .map(i -> i.get(0)) - .collect(Collectors.toList()); - - String candidateTableText = String.join("", candidateTable); - String tokensText = String.join("", tokensNoSpace); - - if (candidateTableText.equals(tokensText)) { - resultIndexCandidate = candidateIndex; - break; - } - } - } - - if (resultIndexCandidate > -1) { - boolean first = true; - for (int i = resultIndexCandidate;i < resultIndexCandidate + tokensNoSpace.size(); i++) { - List line = splitResult.get(i); - String label = Iterables.getLast(line); - if (first) { - first = false; - } else { - if (label.startsWith("I-")) { - break; - } - } - line.set(line.size() - 1, label.replace("
", "")); - } - } else { - System.out.println("Cannot find the result index candiate."); - } - - -// List> badTableResult = Arrays.stream(badTable.getRawLayoutTokens().stream() -// .map(LayoutToken::getText) -// .toArray(String[]::new)) -// .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) -// .collect(Collectors.toList()); -// - - } - - String resultBody2 = splitResult.stream() - .map(l -> String.join("\t", l)) - .collect(Collectors.joining("\n")); - - resultBody = resultBody2; - - } + LOGGER.warn("Identified bad tables: " + badTables.size()); + resultBody = revertResultsForBadItems(badTables, resultBody, TaggingLabels.TABLE_LABEL); tables = tables.stream() - .filter(Table::isGoodTable) + .filter(t-> !badTables.contains(t)) .collect(Collectors.toList()); // further parse the caption @@ -458,6 +362,109 @@ else if (config.getConsolidateCitations() == 2) } } + private static String revertResultsForBadItems(List badItems, String resultBody, String itemLabel) { + //LF: we update the resultBody sequence by reverting these tables as elements + if (CollectionUtils.isNotEmpty(badItems)) { + List> splitResult = Arrays.stream(resultBody.split("\n")) + .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) + .collect(Collectors.toList()); + + for (Figure badTable : badItems) { + // Find the index of the first layoutToken of the table in the tokenization + List rawLayoutTokenTable = badTable.getRawLayoutTokens(); + LayoutToken firstLayoutTokenTable = rawLayoutTokenTable.get(0); + +// final List documentTokenization = layoutTokenization.getTokenization(); + +// int tokenIndex = IntStream.range(0, documentTokenization.size()) +// .filter(i -> { +// LayoutToken l = documentTokenization.get(i); +// return l.getText().equals(firstLayoutTokenTable.getText()) +// && l.getPage() == firstLayoutTokenTable.getPage() +// && l.getOffset() == firstLayoutTokenTable.getOffset(); +// }) +// .findFirst() +// .orElse(-1); + + List candidateIndexes = 
IntStream.range(0, splitResult.size()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + && Iterables.getLast(splitResult.get(i)).equals("I-"+itemLabel)) + .boxed() + .collect(Collectors.toList()); + + if (candidateIndexes.isEmpty()) { + candidateIndexes = IntStream.range(0, splitResult.size()) + .filter(i -> splitResult.get(i).get(0).equals(firstLayoutTokenTable.getText()) + && Iterables.getLast(splitResult.get(i)).equals(itemLabel)) + .boxed() + .collect(Collectors.toList()); + if (candidateIndexes.isEmpty()) { + LOGGER.info("Cannot find the candidate index for fixing the tables."); + continue; + } + } + + // Need to match with the rest + List tokensNoSpace = rawLayoutTokenTable.stream() + .map(LayoutToken::getText) + .map(StringUtils::strip) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toList()); + + int resultIndexCandidate = -1; + if (tokensNoSpace.size() == 1){ + resultIndexCandidate = candidateIndexes.get(0); + } else { + for (int candidateIndex: candidateIndexes) { + List candidateTable = splitResult.subList(candidateIndex, Math.min(splitResult.size(), candidateIndex + tokensNoSpace.size())) // clamp: a candidate near the end of the sequence must not overrun it + .stream() + .map(i -> i.get(0)) + .collect(Collectors.toList()); + + String candidateTableText = String.join("", candidateTable); + String tokensText = String.join("", tokensNoSpace); + + if (candidateTableText.equals(tokensText)) { + resultIndexCandidate = candidateIndex; + break; + } + } + } + + if (resultIndexCandidate > -1) { + boolean first = true; + for (int i = resultIndexCandidate; i < resultIndexCandidate + tokensNoSpace.size() && i < splitResult.size(); i++) { + List line = splitResult.get(i); + String label = Iterables.getLast(line); + if (first) { + first = false; + } else { + if (label.startsWith("I-")) { + break; + } + } + line.set(line.size() - 1, label.replace(itemLabel, TaggingLabels.PARAGRAPH_LABEL)); // relabel with the label being reverted, so bad figures are reverted as well as bad tables + } + } else { + LOGGER.warn("Cannot find the result index candidate."); + } +// List> badTableResult = 
Arrays.stream(badTable.getRawLayoutTokens().stream() +// .map(LayoutToken::getText) +// .toArray(String[]::new)) +// .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) +// .collect(Collectors.toList()); +// + } + + String resultBody2 = splitResult.stream() + .map(l -> String.join("\t", l)) + .collect(Collectors.joining("\n")); + + resultBody = resultBody2; + } + return resultBody; + } + /** * Machine-learning recognition of full text structures limted to header and funding information. @@ -642,84 +649,6 @@ public Pair> processShort(List tokens, Do return Pair.of(res, layoutTokenization); } - /** - * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) - * It converts table and figure labels to paragraph labels. - */ - protected static String postProcessFullTextLabeledText(String fulltextLabeledText) { - if (fulltextLabeledText == null) - return null; - StringBuilder result = new StringBuilder(); - - String[] lines = fulltextLabeledText.split("\n"); - String previousLabel = null; - for(int i=0; i getBodyTextFeatured(Document doc, SortedSet documentBodyParts) { if ((documentBodyParts == null) || (documentBodyParts.size() == 0)) { diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java new file mode 100644 index 0000000000..0cb0b8211a --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/utilities/LabelUtils.java @@ -0,0 +1,84 @@ +package org.grobid.core.utilities; + +import org.apache.commons.lang3.StringUtils; +import org.grobid.core.engines.label.TaggingLabels; + +public class LabelUtils { + /** + * Post-process text labeled by the fulltext model on chunks that are known to be text (no table, or figure) + * It converts table and figure labels to paragraph labels. 
+ */ + public static String postProcessFullTextLabeledText(String fulltextLabeledText) { + if (fulltextLabeledText == null) + return null; + StringBuilder result = new StringBuilder(); + + String[] lines = fulltextLabeledText.split("\n"); + String previousLabel = null; + for(int i=0; i