Skip to content

Commit

Permalink
revise table validation, apply check to figures, move code outside th…
Browse files Browse the repository at this point in the history
…e fulltext parser
  • Loading branch information
lfoppiano committed Dec 6, 2024
1 parent d95a1ac commit b012665
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 211 deletions.
17 changes: 16 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ public boolean apply(GraphicObject graphicObject) {
private List<BoundingBox> textArea;
private List<LayoutToken> layoutTokens;

// Contains the raw layoutTokens from the fulltext model
private List<LayoutToken> rawLayoutTokens = new ArrayList<>();

// coordinates
private int page = -1;
private double y = 0.0;
Expand Down Expand Up @@ -323,8 +326,12 @@ public String getTeiId() {
return "fig_" + this.id;
}

/**
 * Tells whether this figure carries enough information to be serialized to TEI.
 * <p>
 * A figure is complete as soon as it has a non-blank header, a non-empty caption,
 * or at least one graphic object. The first clause previously used
 * {@code isAllBlank(header)}, which reported a figure without any header, caption
 * or graphic object as complete.
 * <p>
 * NOTE(review): a figure should be skipped when this returns {@code false}; the
 * guard at the top of {@code toTEI} must be checked against this contract.
 *
 * @return {@code true} if at least one of header, caption or graphic objects is present
 */
public boolean isCompleteForTEI() {
    return StringUtils.isNotBlank(header)
        || StringUtils.isNotEmpty(caption)
        || CollectionUtils.isNotEmpty(graphicObjects);
}

public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption) && CollectionUtils.isEmpty(graphicObjects)) {
if (isCompleteForTEI()) {
return null;
}
Element figureElement = XmlBuilderUtils.teiElement("figure");
Expand Down Expand Up @@ -568,4 +575,12 @@ public void setLabel(StringBuilder label) {
/**
 * Replaces the URI stored on this figure.
 *
 * @param uri the new value of the {@code uri} field
 */
public void setUri(URI uri) {
this.uri = uri;
}

/**
 * Gives access to the raw layout tokens coming from the fulltext model.
 *
 * @return the list currently held in the {@code rawLayoutTokens} field (not a copy)
 */
public List<LayoutToken> getRawLayoutTokens() {
    return this.rawLayoutTokens;
}

/**
 * Replaces the raw layout tokens (as produced by the fulltext model) kept on this figure.
 *
 * @param rawLayoutTokens the list to store; the reference is kept as-is, no copy is made
 */
public void setRawLayoutTokens(List<LayoutToken> rawLayoutTokens) {
this.rawLayoutTokens = rawLayoutTokens;
}
}
34 changes: 17 additions & 17 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Text;

import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
Expand All @@ -44,8 +43,6 @@ public class Table extends Figure {
private List<LayoutToken> contentTokens = new ArrayList<>();
private List<LayoutToken> fullDescriptionTokens = new ArrayList<>();

// Contains the raw layoutTokens from the fulltext model
private List<LayoutToken> rawLayoutTokens = new ArrayList<>();
private boolean goodTable = true;

private StringBuilder note = null;
Expand All @@ -65,9 +62,13 @@ public Table() {
note = new StringBuilder();
}

/**
 * Tells whether this table carries enough information to be serialized to TEI.
 * <p>
 * Unlike a generic figure, a table needs both a non-empty header and a non-empty
 * caption; {@code toTEI} returns {@code null} when this is {@code false}.
 *
 * @return {@code true} if header and caption are both non-empty
 */
@Override
public boolean isCompleteForTEI() {
    return StringUtils.isNotEmpty(header) && StringUtils.isNotEmpty(caption);
}

@Override
public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter formatter, List<MarkerType> markerTypes) {
if (StringUtils.isEmpty(header) && StringUtils.isEmpty(caption)) {
if (!isCompleteForTEI()) {
return null;
}

Expand Down Expand Up @@ -107,7 +108,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
addXmlId(desc, "_" + divID);
}

if ( (labeledCaption != null) && (labeledCaption.length() > 0) ) {
if (StringUtils.isNotBlank(labeledCaption)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledCaption, captionLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand Down Expand Up @@ -172,15 +173,15 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

Element noteNode = null;
if (note != null && note.toString().trim().length()>0) {
if (StringUtils.isNotBlank(note)) {

noteNode = XmlBuilderUtils.teiElement("note");
if (config.isGenerateTeiIds()) {
String divID = KeyGen.getKey().substring(0, 7);
addXmlId(noteNode, "_" + divID);
}

if ( (labeledNote != null) && (labeledNote.length() > 0) ) {
if (StringUtils.isNotBlank(labeledNote)) {
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, labeledNote, noteLayoutTokens);
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
Expand Down Expand Up @@ -349,9 +350,14 @@ public String getLabeledNote() {
return this.labeledNote;
}

private boolean validateTable() {
/** Check if the table:
* - has label, header and content
* - header starts with "tab"
* - label can be parsed
*/
public boolean validateTable() {
CntManager cnt = Engine.getCntManager();
if (StringUtils.isEmpty(label) || StringUtils.isEmpty(header) || StringUtils.isEmpty(content)) {
if (StringUtils.isAnyBlank(label, header, content)) {
cnt.i(TableRejectionCounters.EMPTY_LABEL_OR_HEADER_OR_CONTENT);
return false;
}
Expand All @@ -362,7 +368,8 @@ private boolean validateTable() {
cnt.i(TableRejectionCounters.CANNOT_PARSE_LABEL_TO_INT);
return false;
}
if (!getHeader().toLowerCase().startsWith("table")) {
// tab covers: table, tabelle, tableau, tabella, etc.
if (!StringUtils.startsWithIgnoreCase(getHeader(), "tab")) {
cnt.i(TableRejectionCounters.HEADER_NOT_STARTS_WITH_TABLE_WORD);
return false;
}
Expand Down Expand Up @@ -427,11 +434,4 @@ public String getTeiId() {
return "tab_" + this.id;
}

/**
 * Gives access to the raw layout tokens coming from the fulltext model.
 *
 * @return the list currently held in the {@code rawLayoutTokens} field (not a copy)
 */
public List<LayoutToken> getRawLayoutTokens() {
    return this.rawLayoutTokens;
}

/**
 * Replaces the raw layout tokens (as produced by the fulltext model) kept on this table.
 *
 * @param rawLayoutTokens the list to store; the reference is kept as-is, no copy is made
 */
public void setRawLayoutTokens(List<LayoutToken> rawLayoutTokens) {
this.rawLayoutTokens = rawLayoutTokens;
}
}
Loading

0 comments on commit b012665

Please sign in to comment.