From 5fb85a64f67b026e2e5d5156c94bbe2b67f46ac5 Mon Sep 17 00:00:00 2001
From: Thomas Rowlands <Thomas.s.Rowlands@gmail.com>
Date: Thu, 28 Nov 2024 18:21:29 +0000
Subject: [PATCH 1/2] Added new PMC config file for the October PMC website
 changes.

Moved config files to inside the autocorpus module.

Updated regression test to use the new directory structure.

Updated README to reflect directory changes.
---
 README.md                                     |   10 +-
 .../configs}/config_nature_genetics.json      |    0
 .../configs}/config_plos_genetics.json        |    0
 autocorpus/configs/config_pmc.json            |  197 ++
 .../configs/config_pmc_pre_oct_2024.json      |    0
 .../configs}/config_template.json             |    0
 tests/data/PMC/Current/PMC8885717.html        | 3003 +++++++++++++++++
 .../{ => PMC/Pre-Oct-2024}/PMC8885717.html    |    0
 .../PMC8885717_abbreviations.json             |    2 +-
 .../Pre-Oct-2024}/PMC8885717_bioc.json        |    2 +-
 .../Pre-Oct-2024}/PMC8885717_tables.json      |    8 +-
 tests/test_regression.py                      |   12 +-
 12 files changed, 3217 insertions(+), 17 deletions(-)
 rename {configs => autocorpus/configs}/config_nature_genetics.json (100%)
 rename {configs => autocorpus/configs}/config_plos_genetics.json (100%)
 create mode 100644 autocorpus/configs/config_pmc.json
 rename configs/config_pmc.json => autocorpus/configs/config_pmc_pre_oct_2024.json (100%)
 rename {configs => autocorpus/configs}/config_template.json (100%)
 create mode 100644 tests/data/PMC/Current/PMC8885717.html
 rename tests/data/{ => PMC/Pre-Oct-2024}/PMC8885717.html (100%)
 rename tests/data/{ => PMC/Pre-Oct-2024}/PMC8885717_abbreviations.json (96%)
 rename tests/data/{ => PMC/Pre-Oct-2024}/PMC8885717_bioc.json (99%)
 rename tests/data/{ => PMC/Pre-Oct-2024}/PMC8885717_tables.json (99%)

diff --git a/README.md b/README.md
index a26ed59..5c4cae4 100644
--- a/README.md
+++ b/README.md
@@ -23,13 +23,13 @@ pip install autocorpus
 Run the below command for a single file example
 
 ```sh
-auto-corpus -c "configs/config_pmc.json" -t "output" -f "path/to/html/file" -o JSON
+auto-corpus -c "autocorpus/configs/config_pmc.json" -t "output" -f "path/to/html/file" -o JSON
 ```
 
 Run the main app for a directory of files example
 
 ```sh
-auto-corpus -c "configs/config_pmc.json" -t "output" -f "path/to/directory/of/html/files" -o JSON
+auto-corpus -c "autocorpus/configs/config_pmc.json" -t "output" -f "path/to/directory/of/html/files" -o JSON
 ```
 
 ### Available arguments
@@ -45,7 +45,7 @@ auto-corpus -c "configs/config_pmc.json" -t "output" -f "path/to/directory/of/ht
 
 If you wish to contribute or edit a config file then please follow the instructions in the [config guide](docs/config_tutorial.md).
 
-Auto-CORPus is able to parse HTML from different publishers, which utilise different HTML structures and naming conventions. This is made possible by the inclusion of config files which tell Auto-CORPus how to identify specific sections of the article/table within the source HTML. We have supplied a config template along with example config files for [PubMed Central](configs/config_pmc.json), [Plos Genetics](configs/config_plos_genetics.json) and [Nature Genetics](configs/config_nature_genetics.json) in the [configs](configs) directory. Users of Auto-CORPus can submit their own config files for different sources via the [issues](https://github.com/omicsNLP/Auto-CORPus/issues) tab.
+Auto-CORPus is able to parse HTML from different publishers, which utilise different HTML structures and naming conventions. This is made possible by the inclusion of config files which tell Auto-CORPus how to identify specific sections of the article/table within the source HTML. We have supplied a config template along with example config files for [PubMed Central](autocorpus/configs/config_pmc.json), [Plos Genetics](autocorpus/configs/config_plos_genetics.json) and [Nature Genetics](autocorpus/configs/config_nature_genetics.json) in the [configs](autocorpus/configs) directory. Users of Auto-CORPus can submit their own config files for different sources via the [issues](https://github.com/omicsNLP/Auto-CORPus/issues) tab.
 
 **Auto-CORPus recognises 2 types of input file which are:**
 
@@ -125,13 +125,13 @@ To get started:
 1. Run the main app for a single file example:
 
    ```sh
-   python -m autocorpus -c "configs/config_pmc.json" -t "output" -f "path/to/html/file" -o JSON
+   python -m autocorpus -c "autocorpus/configs/config_pmc.json" -t "output" -f "path/to/html/file" -o JSON
    ```
 
 1. Run the main app for a directory of files example
 
    ```sh
-   python -m autocorpus -c "configs/config_pmc.json" -t "output" -f "path/to/directory/of/html/files" -o JSON
+   python -m autocorpus -c "autocorpus/configs/config_pmc.json" -t "output" -f "path/to/directory/of/html/files" -o JSON
    ```
 
 **Note:** The `auto-corpus` commandline script is also available and will behave the same as `python -m autocorpus`
diff --git a/configs/config_nature_genetics.json b/autocorpus/configs/config_nature_genetics.json
similarity index 100%
rename from configs/config_nature_genetics.json
rename to autocorpus/configs/config_nature_genetics.json
diff --git a/configs/config_plos_genetics.json b/autocorpus/configs/config_plos_genetics.json
similarity index 100%
rename from configs/config_plos_genetics.json
rename to autocorpus/configs/config_plos_genetics.json
diff --git a/autocorpus/configs/config_pmc.json b/autocorpus/configs/config_pmc.json
new file mode 100644
index 0000000..b00297c
--- /dev/null
+++ b/autocorpus/configs/config_pmc.json
@@ -0,0 +1,197 @@
+{
+    "config": {
+        "references": {
+            "data": {
+                "title": [
+                ],
+                "journal": [
+                ],
+                "volume": [
+                ]
+            },
+            "defined-by": [
+                {
+                    "tag": "li"
+                },
+                {
+                    "tag": "p",
+                    "attrs": {
+                        "id": "(__){0,2}p\\d+"
+                    }
+                },
+                {
+                    "xpath": "//*[@class=\"ref-list\"]"
+                }
+            ]
+        },
+        "title": {
+            "data": {},
+            "defined-by": [
+                {
+                    "tag": "h1",
+                    "xpath": "/html/body/div[2]/div[2]/div/div[1]/div/div[2]/main/article/section[1]/section[2]/div/hgroup/h1"
+                }
+            ]
+        },
+        "keywords": {
+            "data": {},
+            "defined-by": [
+                {
+                    "tag": "section",
+                    "attrs": {
+                        "class": [
+                            "kwd-group"
+                        ]
+                    }
+                }
+            ]
+        },
+        "abbreviations-table": {
+            "data": {},
+            "defined-by": [
+                {
+                    "tag": "table",
+                    "attrs": {
+                        "class": "glossary"
+                    }
+                }
+            ]
+        },
+        "sections": {
+            "data": {
+                "headers": [
+                    {
+                        "tag": "h2",
+                        "attrs": {
+                            "class": "pmc_sec_title"
+                        }
+                    },
+                    {
+                        "tag": "h2"
+                    }
+                ]
+            },
+            "defined-by": [
+                {
+                    "xpath": "//section[contains(@class, 'body')]/section"
+                }
+            ]
+        },
+        "sub-sections": {
+            "data": {
+                "headers": [
+                    {
+                        "tag": "h[3-6]",
+                        "attrs": {
+                            "class": "pmc_sec_title"
+                        }
+                    }
+                ]
+            },
+            "defined-by": [
+                {
+                    "tag": "section",
+                    "xpath": "//section[contains(@class, 'body')]/section/section"
+                }
+            ]
+        },
+        "paragraphs": {
+            "data": {},
+            "defined-by": [
+                {
+                    "tag": "p"
+                },
+                {
+                    "tag": "p",
+                    "xpath": "//section[contains(@class, 'body')]/section//p"
+                }
+            ]
+        },
+        "tables": {
+            "data": {
+                "caption": [
+                    {
+                        "tag": "div",
+                        "attrs": {
+                            "class": "caption"
+                        }
+                    }
+                ],
+                "table-content": [
+                    {
+                        "tag": "table"
+                    }
+                ],
+                "title": [
+                    {
+                        "tag": "h4",
+                        "attrs": {
+                            "class": "obj_head"
+                        }
+                    }
+                ],
+                "footer": [
+                    {
+                        "tag": "div",
+                        "attrs": {
+                            "class": "tw-foot"
+                        }
+                    }
+                ],
+                "table-row": [
+                    {
+                        "tag": "tr"
+                    }
+                ],
+                "header-row": [
+                    {
+                        "tag": "thead"
+                    }
+                ],
+                "header-element": [
+                    {
+                        "tag": "th"
+                    }
+                ]
+            },
+            "defined-by": [
+                {
+                    "tag": "section",
+                    "attrs": {
+                        "class": "tw"
+                    }
+                }
+            ]
+        },
+        "figures": {
+            "data": {
+                "caption": [
+                    {
+                        "tag": "p"
+                    }
+                ]
+            },
+            "defined-by": [
+                {
+                    "tag": "figcaption"
+                }
+            ]
+        }
+    },
+    "contributions": {
+        "author": {
+            "name": "Tom Shorter",
+            "contact_email": "ts339@le.ac.uk",
+            "comments": "Provided with Auto-CORPus for processing PubMed Central HTML files"
+        },
+        "editors": [
+            {
+                "name": "Thomas Rowlands",
+                "contact_email": "",
+                "date_edited": "28/11/2024",
+                "comments": "Modified for compatibility with PMC website changes from October 2024."
+            }
+        ]
+    },
+    "example_source_HTML_URL": "https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717"
+}
diff --git a/configs/config_pmc.json b/autocorpus/configs/config_pmc_pre_oct_2024.json
similarity index 100%
rename from configs/config_pmc.json
rename to autocorpus/configs/config_pmc_pre_oct_2024.json
diff --git a/configs/config_template.json b/autocorpus/configs/config_template.json
similarity index 100%
rename from configs/config_template.json
rename to autocorpus/configs/config_template.json
diff --git a/tests/data/PMC/Current/PMC8885717.html b/tests/data/PMC/Current/PMC8885717.html
new file mode 100644
index 0000000..190226c
--- /dev/null
+++ b/tests/data/PMC/Current/PMC8885717.html
@@ -0,0 +1,3003 @@
+
+<!DOCTYPE html>
+<html lang="en" >
+    <head >
+
+        <meta charset="UTF-8" />
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="HandheldFriendly" content="True" />
+        <meta name="MobileOptimized" content="320" />
+        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+
+        
+        
+
+        
+        
+  <link rel="stylesheet" href="/static/assets/style-70b9163a.css" />
+<script type="module" crossorigin="" src="/static/assets/base_style-ec2bc71e.js"></script>
+
+  <link rel="stylesheet" href="/static/assets/style-ef962842.css" />
+<link rel="stylesheet" href="/static/assets/style-3ade8b5c.css" />
+<script type="module" crossorigin="" src="/static/assets/article_style-d757a0dd.js"></script>
+
+  
+  
+    <style>
+  
+  
+  @media screen and (min-width: 64em) {
+    div.pmc-wm {
+      background: repeat-y;
+      background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='20' height='350' xmlns:xlink='http://www.w3.org/1999/xlink'%3E%3Cdefs%3E%3Cfilter x='-.02' y='0' width='1.05' height='1' id='c'%3E%3CfeFlood flood-color='%23FFF'/%3E%3CfeComposite in='SourceGraphic'/%3E%3C/filter%3E%3Ctext id='b' font-family='Helvetica' font-size='11pt' style='opacity:1;fill:%23005ea2;stroke:none;text-anchor:middle' x='175' y='14'%3E%3C/text%3E%3Cpath id='a' style='fill:%23005ea2' d='M0 8h350v3H0z'/%3E%3C/defs%3E%3Cuse xlink:href='%23a' transform='rotate(90 10 10)'/%3E%3Cuse xlink:href='%23b' transform='rotate(90 10 10)' filter='url(%23c)'/%3E%3C/svg%3E");
+      padding-left: 3rem;
+    }
+  }
+</style>
+
+  
+
+
+
+        
+            <link rel="apple-touch-icon"
+                  sizes="180x180"
+                  href="/static/img/favicons/apple-touch-icon.png" />
+            <link rel="icon"
+                  type="image/png"
+                  sizes="48x48"
+                  href="/static/img/favicons/favicon-48x48.png" />
+            <link rel="icon"
+                  type="image/png"
+                  sizes="32x32"
+                  href="/static/img/favicons/favicon-32x32.png" />
+            <link rel="icon"
+                  type="image/png"
+                  sizes="16x16"
+                  href="/static/img/favicons/favicon-16x16.png" />
+            <link rel="manifest" href="/static/img/favicons/site.webmanifest" />
+            <link rel="mask-icon"
+                  href="/static/img/favicons/safari-pinned-tab.svg"
+                  color="#0071bc" />
+            <meta name="msapplication-config"
+                  content="/static/img/favicons/browserconfig.xml" />
+            <meta name="theme-color" content="#ffffff" />
+        
+
+        <title>
+            Auto-CORPus: A Natural Language Processing Tool for Standardizing and Reusing Biomedical Literature - PMC
+        </title>
+
+        
+        
+  
+  <!-- Logging params: Pinger defaults -->
+<meta name="ncbi_app" content="cloudpmc-viewer" />
+<meta name="ncbi_db" content="pmc" />
+<meta name="ncbi_phid" content="E5FDA0AA7445CA2304A0AA000D3ABEDC.m_1" />
+<!-- Logging params: Pinger custom -->
+<meta name="ncbi_pdid" content="article" />
+  
+    <link rel="preconnect" href="https://www.google-analytics.com" />
+
+    
+        <link rel="dns-prefetch" href="https://cdn.ncbi.nlm.nih.gov" />
+    
+
+    <link rel="preconnect" href="https://code.jquery.com" />
+    <meta name="ncbi_domain" content="fdh">
+<meta name="ncbi_type" content="fulltext">
+<meta name="ncbi_pcid" content="journal">
+<meta name="ncbi_feature" content="associated_data">
+<link rel="canonical" href="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/">
+<meta name="robots" content="INDEX,NOFOLLOW,NOARCHIVE">
+<meta name="citation_journal_title" content="Frontiers in Digital Health">
+<meta name="citation_title" content="Auto-CORPus: A Natural Language Processing Tool for Standardizing and Reusing Biomedical Literature">
+<meta name="citation_author" content="Tim Beck">
+<meta name="citation_author_institution" content="Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom">
+<meta name="citation_author_institution" content="Health Data Research UK (HDR UK), London, United Kingdom">
+<meta name="citation_author" content="Tom Shorter">
+<meta name="citation_author_institution" content="Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom">
+<meta name="citation_author" content="Yan Hu">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author_institution" content="Department of Surgery and Cancer, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Zhuoyu Li">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Shujian Sun">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Casiana M Popovici">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author_institution" content="Department of Surgery and Cancer, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Nicholas A R McQuibban">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author_institution" content="Centre for Integrative Systems Biology and Bioinformatics (CISBIO), Department of Life Sciences, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Filip Makraduli">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Cheng S Yeung">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_author" content="Thomas Rowlands">
+<meta name="citation_author_institution" content="Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom">
+<meta name="citation_author" content="Joram M Posma">
+<meta name="citation_author_institution" content="Health Data Research UK (HDR UK), London, United Kingdom">
+<meta name="citation_author_institution" content="Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom">
+<meta name="citation_publication_date" content="2022 Feb 15">
+<meta name="citation_volume" content="4">
+<meta name="citation_firstpage" content="788124">
+<meta name="citation_doi" content="10.3389/fdgth.2022.788124">
+<meta name="citation_pmid" content="35243479">
+<meta name="citation_abstract_html_url" content="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/">
+<meta name="citation_fulltext_html_url" content="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/">
+<meta name="citation_pdf_url" content="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/pdf/fdgth-04-788124.pdf">
+<meta name="description" content="To analyse large corpora using machine learning and other Natural Language Processing (NLP) algorithms, the corpora need to be standardized. The BioC format is a community-driven simple data structure for sharing text and annotations, however there ...">
+<meta name="og:title" content="Auto-CORPus: A Natural Language Processing Tool for Standardizing and Reusing Biomedical Literature">
+<meta name="og:type" content="article">
+<meta name="og:site_name" content="PubMed Central (PMC)">
+<meta name="og:description" content="To analyse large corpora using machine learning and other Natural Language Processing (NLP) algorithms, the corpora need to be standardized. The BioC format is a community-driven simple data structure for sharing text and annotations, however there ...">
+<meta name="og:url" content="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/">
+<meta name="og:image" content="https://cdn.ncbi.nlm.nih.gov/pmc/cms/images/pmc-card-share.jpg?_=0">
+<meta name="twitter:card" content="summary_large_image">
+<meta name="twitter:site" content="@ncbi">
+    
+    
+
+    </head>
+    <body >
+        
+    <a class="usa-skipnav " href="#main-content">
+      Skip to main content
+    </a>
+
+
+        
+            
+
+<section class="usa-banner " aria-label="Official website of the United States government" >
+    <div class="usa-accordion">
+        <header class="usa-banner__header">
+            <div class="usa-banner__inner">
+                <div class="grid-col-auto">
+                    <img aria-hidden="true"
+                         class="usa-banner__header-flag"
+                         src="/static/img/us_flag.svg"
+                         alt="" />
+                </div>
+
+                <div class="grid-col-fill tablet:grid-col-auto" aria-hidden="true">
+                    <p class="usa-banner__header-text">
+                        An official website of the United States government
+                    </p>
+                    <span class="usa-banner__header-action">Here's how you know</span>
+                </div>
+
+                
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-accordion__button usa-banner__button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           
+           "
+    aria-expanded="false"
+    aria-controls="gov-banner-default"
+    
+    data-testid="storybook-django-banner"
+    
+    >
+    
+        
+
+        
+                    <span class="usa-banner__button-text">Here's how you know</span>
+                
+
+        
+    
+        
+            </button>
+        
+
+
+            </div>
+        </header>
+
+        <div class="usa-banner__content usa-accordion__content"
+             id="gov-banner-default"
+             hidden>
+            <div class="grid-row grid-gap-lg">
+                <div class="usa-banner__guidance tablet:grid-col-6">
+                    <img class="usa-banner__icon usa-media-block__img"
+                         src="/static/img/icon-dot-gov.svg"
+                         alt=""
+                         aria-hidden="true" />
+                    <div class="usa-media-block__body">
+                        <p>
+                            <strong>Official websites use .gov</strong>
+                            <br />
+                            A
+                            <strong>.gov</strong> website belongs to an official
+                            government organization in the United States.
+                        </p>
+                    </div>
+                </div>
+
+                <div class="usa-banner__guidance tablet:grid-col-6">
+                    <img class="usa-banner__icon usa-media-block__img"
+                         src="/static/img/icon-https.svg"
+                         alt=""
+                         aria-hidden="true" />
+
+                    <div class="usa-media-block__body">
+                        <p>
+                            <strong>Secure .gov websites use HTTPS</strong>
+                            <br />
+                            A <strong>lock</strong> (
+                            <span class="icon-lock">
+                                <svg xmlns="http://www.w3.org/2000/svg"
+                                     width="52"
+                                     height="64"
+                                     viewBox="0 0 52 64"
+                                     class="usa-banner__lock-image"
+                                     role="graphics-symbol"
+                                     aria-labelledby="banner-lock-description"
+                                     focusable="false">
+                                    <title id="banner-lock-title">Lock</title>
+                                    <desc id="banner-lock-description">
+                                    Locked padlock icon
+                                    </desc>
+                                    <path fill="#000000"
+                                          fill-rule="evenodd"
+                                          d="M26 0c10.493 0 19 8.507 19 19v9h3a4 4 0 0 1 4 4v28a4 4 0 0 1-4 4H4a4 4 0 0 1-4-4V32a4 4 0 0 1 4-4h3v-9C7 8.507 15.507 0 26 0zm0 8c-5.979 0-10.843 4.77-10.996 10.712L15 19v9h22v-9c0-6.075-4.925-11-11-11z" />
+                                </svg>
+</span>) or <strong>https://</strong> means you've safely
+                                connected to the .gov website. Share sensitive
+                                information only on official, secure websites.
+                            </p>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </section>
+
+        
+
+        
+    
+    
+
+<div class="usa-overlay">
+</div>
+
+
+
+<header class="usa-header usa-header--extended usa-header--wide" data-testid="header" data-header >
+    <div class="ncbi-header">
+        <div class="ncbi-header__container">
+            
+                <a class="ncbi-header__logo-container" href="/">
+                    <img alt="
+                                  PMC home page
+                              "
+                         class="ncbi-header__logo-image"
+                         src="/static/img/ncbi-logos/nih-nlm-ncbi--white.svg" />
+                </a>
+            
+
+            <!-- Mobile menu hamburger button -->
+            
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-menu-btn ncbi-header__hamburger-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           
+           "
+    
+    
+    aria-label="Show menu"
+    data-testid="navMenuButton"
+    
+    >
+    
+        
+
+        
+                <svg aria-hidden="true"
+                     class="ncbi-hamburger-icon"
+                     fill="none"
+                     focusable="false"
+                     height="21"
+                     viewBox="0 0 31 21"
+                     width="31"
+                     xmlns="http://www.w3.org/2000/svg">
+                    <path clip-rule="evenodd"
+                          d="M0.125 20.75H30.875V17.3333H0.125V20.75ZM0.125 12.2083H30.875V8.79167H0.125V12.2083ZM0.125 0.25V3.66667H30.875V0.25H0.125Z"
+                          fill="#F1F1F1"
+                          fill-rule="evenodd" />
+                </svg>
+            
+
+        
+    
+        
+            </button>
+        
+
+
+
+            
+                <!-- Desktop buttons-->
+                <div class="ncbi-header__desktop-buttons">
+                    
+                        <!-- Desktop search button -->
+                        
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           usa-button--unstyled ncbi-header__desktop-button
+           "
+    aria-expanded="false"
+    aria-controls="search-field-desktop-navigation"
+    aria-label="Show search overlay"
+    data-testid="toggleSearchPanelButton"
+    data-toggle-search-panel-button
+    >
+    
+        
+
+        
+                            
+
+
+    <svg class="usa-icon " role="graphics-symbol" aria-hidden="true"    >
+        
+        <use xlink:href="/static/img/sprite.svg#search" />
+    </svg>
+
+
+                            Search
+                        
+
+        
+    
+        
+            </button>
+        
+
+
+                    
+
+                    <!-- Desktop login dropdown -->
+                    
+                        <div class="ncbi-header__login-dropdown">
+                            
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           usa-button--unstyled ncbi-header__desktop-button ncbi-header__login-dropdown-button
+           "
+    aria-expanded="false"
+    aria-controls="login-dropdown-menu"
+    aria-label="Show login menu"
+    data-testid="toggleLoginMenuDropdown"
+    data-desktop-login-button
+    >
+    
+        
+
+        
+                                
+
+
+    <svg class="usa-icon " role="graphics-symbol" aria-hidden="true"    >
+        
+        <use xlink:href="/static/img/sprite.svg#person" />
+    </svg>
+
+
+
+                                <span data-login-dropdown-text>Log in</span>
+
+                                <!-- Dropdown icon pointing up -->
+                                
+
+
+    <svg class="usa-icon ncbi-header__login-dropdown-icon ncbi-header__login-dropdown-icon--expand-less ncbi-header__login-dropdown-icon--hidden" role="graphics-symbol" aria-hidden="true"    data-login-dropdown-up-arrow>
+        
+        <use xlink:href="/static/img/sprite.svg#expand_less" />
+    </svg>
+
+
+
+                                <!-- Dropdown icon pointing down -->
+                                
+
+
+    <svg class="usa-icon ncbi-header__login-dropdown-icon ncbi-header__login-dropdown-icon--expand-more ncbi-header__login-dropdown-icon--hidden" role="graphics-symbol" aria-hidden="true"    data-login-dropdown-down-arrow>
+        
+        <use xlink:href="/static/img/sprite.svg#expand_more" />
+    </svg>
+
+
+                            
+
+        
+    
+        
+            </button>
+        
+
+
+
+                            <!-- Login dropdown menu -->
+                            <ul class="usa-nav__submenu ncbi-header__login-dropdown-menu"
+                                id="login-dropdown-menu"
+                                data-desktop-login-menu-dropdown
+                                hidden>
+                                
+                                    <li class="usa-nav__submenu-item">
+                                        <!-- Uses custom style overrides to render external and document links. -->
+                                        
+
+
+
+
+
+
+
+
+<a href="https://www.ncbi.nlm.nih.gov/myncbi/" class="usa-link  "  >
+    
+
+    Dashboard
+
+    
+</a>
+
+                                    </li>
+                                
+                                    <li class="usa-nav__submenu-item">
+                                        <!-- Uses custom style overrides to render external and document links. -->
+                                        
+
+
+
+
+
+
+
+
+<a href="https://www.ncbi.nlm.nih.gov/myncbi/collections/bibliography/" class="usa-link  "  >
+    
+
+    Publications
+
+    
+</a>
+
+                                    </li>
+                                
+                                    <li class="usa-nav__submenu-item">
+                                        <!-- Uses custom style overrides to render external and document links. -->
+                                        
+
+
+
+
+
+
+
+
+<a href="https://www.ncbi.nlm.nih.gov/account/settings/" class="usa-link  "  >
+    
+
+    Account settings
+
+    
+</a>
+
+                                    </li>
+                                
+                                <li class="usa-nav__submenu-item">
+                                    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           usa-button--outline ncbi-header__login-dropdown-logout-button
+           "
+    
+    
+    
+    data-testid="desktopLogoutButton"
+    data-desktop-logout-button
+    >
+    
+        
+
+        Log out
+
+        
+    
+        
+            </button>
+        
+
+
+                                </li>
+                            </ul>
+                        </div>
+                    
+                </div>
+            
+        </div>
+    </div>
+
+    <!-- Search panel -->
+    
+        <div class="ncbi-search-panel ncbi--show-only-at-desktop"
+             data-testid="searchPanel"
+             data-header-search-panel
+             hidden>
+            <div class="ncbi-search-panel__container">
+                <form action="https://www.ncbi.nlm.nih.gov/search/all/"
+                      aria-describedby="search-field-desktop-navigation-help-text"
+                      autocomplete="off"
+                      class="usa-search usa-search--big ncbi-search-panel__form"
+                      data-testid="form"
+                      method="GET"
+                      role="search">
+                    <label class="usa-sr-only"
+                           data-testid="label"
+                           for="search-field-desktop-navigation">
+                        Search…
+                    </label>
+                    <input class="usa-input"
+                           data-testid="textInput"
+                           id="search-field-desktop-navigation"
+                           name="term"
+                           
+                               placeholder="Search NCBI"
+                           
+                           type="search"
+                           value="" />
+                    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="submit"
+        
+    
+    class="usa-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           
+           "
+    
+    
+    
+    data-testid="button"
+    
+    >
+    
+        
+
+        
+                        <span class="usa-search__submit-text">
+                            Search NCBI
+                        </span>
+                    
+
+        
+    
+        
+            </button>
+        
+
+
+                </form>
+
+                
+            </div>
+        </div>
+    
+
+    <nav aria-label="Primary navigation" class="usa-nav">
+        <p class="usa-sr-only" id="primary-navigation-sr-only-title">
+            Primary site navigation
+        </p>
+
+        <!-- Mobile menu close button -->
+        
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-nav__close ncbi-nav__close-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           
+           "
+    
+    
+    aria-label="Close navigation menu"
+    data-testid="navCloseButton"
+    
+    >
+    
+        
+
+        
+            <img src="/static/img/usa-icons/close.svg" alt="Close" />
+        
+
+        
+    
+        
+            </button>
+        
+
+
+
+        
+            <!-- Mobile search component -->
+            <form class="usa-search usa-search--small ncbi--hide-at-desktop margin-top-6"
+                  role="search">
+                <label class="usa-sr-only" for="search-field">
+                    Search
+                </label>
+
+                <input class="usa-input"
+                       id="search-field-mobile-navigation"
+                       type="search"
+                       
+                           placeholder="Search NCBI"
+                       
+                       name="search" />
+
+                
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="submit"
+        
+    
+    class="usa-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           
+           "
+    
+    
+    
+    
+    
+    >
+    
+        
+
+        
+                    <!-- This SVG should be kept inline and not replaced with a link to the icon as otherwise it will render in the wrong color -->
+                    <img src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIGhlaWdodD0iMjQiIHZpZXdCb3g9IjAgMCAyNCAyNCIgd2lkdGg9IjI0Ij48cGF0aCBkPSJNMCAwaDI0djI0SDB6IiBmaWxsPSJub25lIi8+PHBhdGggZmlsbD0iI2ZmZiIgZD0iTTE1LjUgMTRoLS43OWwtLjI4LS4yN0E2LjQ3MSA2LjQ3MSAwIDAgMCAxNiA5LjUgNi41IDYuNSAwIDEgMCA5LjUgMTZjMS42MSAwIDMuMDktLjU5IDQuMjMtMS41N2wuMjcuMjh2Ljc5bDUgNC45OUwyMC40OSAxOWwtNC45OS01em0tNiAwQzcuMDEgMTQgNSAxMS45OSA1IDkuNVM3LjAxIDUgOS41IDUgMTQgNy4wMSAxNCA5LjUgMTEuOTkgMTQgOS41IDE0eiIvPjwvc3ZnPg=="
+                         class="usa-search__submit-icon"
+                         alt="Search" />
+                
+
+        
+    
+        
+            </button>
+        
+
+
+            </form>
+
+            
+        
+
+        <!-- Primary navigation menu items -->
+        <!-- This usa-nav__inner wrapper is required to correctly style the navigation items on Desktop -->
+        
+
+        
+            <div class="ncbi-nav__mobile-login-menu ncbi--hide-at-desktop"
+                 data-mobile-login-menu
+                 hidden>
+                <p class="ncbi-nav__mobile-login-menu-status">
+                    Logged in as:
+                    <strong class="ncbi-nav__mobile-login-menu-email"
+                            data-mobile-login-email-text></strong>
+                </p>
+                <ul class="usa-nav__primary usa-accordion">
+                    
+                        <li class="usa-nav__primary-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.ncbi.nlm.nih.gov/myncbi/" class="usa-link  "  >
+    
+
+    Dashboard
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="usa-nav__primary-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.ncbi.nlm.nih.gov/myncbi/collections/bibliography/" class="usa-link  "  >
+    
+
+    Publications
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="usa-nav__primary-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.ncbi.nlm.nih.gov/account/settings/" class="usa-link  "  >
+    
+
+    Account settings
+
+    
+</a>
+
+                        </li>
+                    
+                </ul>
+            </div>
+        
+
+        
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    
+        <button
+            type="button"
+        
+    
+    class="usa-button
+           
+
+           
+               
+               
+               
+               
+            
+
+           
+           
+           ncbi-nav__mobile-login-button ncbi--hide-at-desktop
+           "
+    
+    
+    
+    data-testid="mobileLoginButton"
+    data-mobile-login-button
+    >
+    
+        
+
+        Log in
+
+        
+    
+        
+            </button>
+        
+
+
+    </nav>
+</header>
+
+    
+        
+
+<section class="pmc-header pmc-header--basic" aria-label="PMC Header with search box">
+    <div class="pmc-nav-container">
+        <div class="pmc-header__bar">
+           <div class="pmc-header__logo">
+               <a href="/" title="Home" aria-label="PMC Home"></a>
+           </div>
+            <button
+                    type="button"
+                    class="usa-button usa-button--unstyled pmc-header__search__button"
+                    aria-label="Open search"
+                    data-ga-category="search"
+                    data-ga-action="PMC"
+                    data-ga-label="pmc_search_panel_mobile"
+            >
+                <svg class="usa-icon width-4 height-4 pmc-icon__open" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#search"></use>
+                </svg>
+                <svg class="usa-icon width-4 height-4 pmc-icon__close" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#close"></use>
+                </svg>
+            </button>
+        </div>
+        <div class="pmc-header__search">
+            
+
+
+<form class="usa-search usa-search--extra usa-search--article-right-column pmc-header__search__form" autocomplete="off" role="search">
+<label class="usa-sr-only" for="pmc-search">Search PMC Full-Text Archive</label>
+<span class="autoComplete_wrapper flex-1">
+<input class="usa-input width-full maxw-none" required="required" placeholder="Search PMC Full-Text Archive" id="pmc-search" type="search" name="term" data-autocomplete-url="/search/autocomplete/"/>
+</span>
+<button
+class="usa-button"
+type="submit"
+formaction="https://www.ncbi.nlm.nih.gov/pmc/"
+data-ga-category="search"
+data-ga-action="PMC"
+data-ga-label="PMC_search_button"
+>
+<span class="usa-search__submit-text">Search in PMC</span>
+<img
+src="/static/img/usa-icons-bg/search--white.svg"
+class="usa-search__submit-icon"
+alt="Search"
+/>
+</button>
+</form>
+            <ul class="pmc-header__search__menu">
+                <li>
+                    <a class="usa-link" href="https://www.ncbi.nlm.nih.gov/pmc/advanced/" data-ga-action="featured_link" data-ga-label="advanced_search">
+                        Advanced Search
+                    </a>
+                </li>
+                <li>
+                    
+                        <a class="usa-link" href="/journals/" data-ga-action="featured_link" data-ga-label="journal list">
+                            Journal List
+                        </a>
+                    
+                </li>
+                <li>
+                    
+                        <a class="usa-link" href="/about/userguide/" data-ga-action="featured_link"
+                        data-ga-label="user guide">
+                            User Guide
+                        </a>
+                    
+                </li>
+            </ul>
+        </div>
+    </div>
+</section>
+
+    
+
+
+        
+        
+
+       
+  <div class="usa-section padding-top-0 desktop:padding-top-6 pmc-article-section" data-article-db="pmc" data-article-id="8885717">
+
+    
+
+   
+
+
+
+<div class="grid-container pmc-actions-bar" aria-label="Actions bar" role="complementary">
+    <div class="grid-row">
+        <div class="grid-col-fill display-flex">
+             <div class="display-flex">
+                <ul class="usa-list usa-list--unstyled usa-list--horizontal">
+                    <li class="margin-right-2 mobile-lg:margin-right-4 display-flex mob">
+                        <button
+                                type="button"
+                                class="usa-button pmc-sidenav__container__open usa-button--unstyled width-auto display-flex"
+                                aria-label="Open resources"
+                                data-extra-class="is-visible-resources"
+                                data-ga-category="resources_accordion"
+                                data-ga-action="click"
+                                data-ga-label="mobile_icon"
+                        >
+                            <svg class="usa-icon width-4 height-4" aria-hidden="true" focusable="false" role="img">
+                                <use xlink:href="/static/img/sprite.svg#more_vert"></use>
+                            </svg>
+                        </button>
+                    </li>
+                    
+                    <li class="margin-right-2 mobile-lg:margin-right-4 display-flex mob">
+                        <a
+                                href="https://doi.org/10.3389/fdgth.2022.788124"
+                                class="usa-link display-flex"
+                                role="button"
+                                target="_blank"
+                                rel="noreferrer noopener"
+                                aria-label="View on publisher site"
+                                data-ga-category="actions"
+                                data-ga-action="click"
+                                data-ga-label="publisher_link_mobile"
+                        >
+                                <svg class="usa-icon width-4 height-4" aria-hidden="true" focusable="false" role="img">
+                                    <use xlink:href="/static/img/sprite.svg#launch"></use>
+                                </svg>
+                        </a>
+                    </li>
+                    
+                    
+                        <li class="margin-right-2 mobile-lg:margin-right-4 display-flex">
+                             <a
+                                     href="pdf/fdgth-04-788124.pdf"
+                                     class="usa-link display-flex"
+                                     role="button"
+                                     aria-label="Download PDF"
+                                     data-ga-category="actions"
+                                     data-ga-action="click"
+                                     data-ga-label="pdf_download_mobile"
+                             >
+                                <svg class="usa-icon width-4 height-4" aria-hidden="true" focusable="false" role="img">
+                                    <use xlink:href="/static/img/sprite.svg#file_download"></use>
+                                </svg>
+                            </a>
+                        </li>
+                    
+                    <li class="margin-right-2 mobile-lg:margin-right-4 display-flex">
+                        <button
+                                class="usa-button usa-button--unstyled collections-dialog-trigger collections-button display-flex collections-button-empty"
+                                 aria-label="Save article in MyNCBI collections."
+                                  data-ga-category="actions"
+                                  data-ga-action="click"
+                                  data-ga-label="collections_button_mobile"
+                                  data-collections-open-dialog-enabled="false"
+                                  data-collections-open-dialog-url="https://account.ncbi.nlm.nih.gov/?back_url=https%3A%2F%2Fpmc.ncbi.nlm.nih.gov%2Farticles%2FPMC8885717%2F%23open-collections-dialog"
+                                  data-in-collections="false"
+                        >
+                            <svg class="usa-icon width-4 height-4 usa-icon--bookmark-full" aria-hidden="true" focusable="false" role="img" hidden>
+                                <use xlink:href="/static/img/action-bookmark-full.svg#icon"></use>
+                            </svg>
+                            <svg class="usa-icon width-4 height-4 usa-icon--bookmark-empty" aria-hidden="true" focusable="false" role="img" hidden>
+                                <use xlink:href="/static/img/action-bookmark-empty.svg#icon"></use>
+                            </svg>
+                        </button>
+                    </li>
+                    
+                    <li class="margin-right-2 mobile-lg:margin-right-4 display-flex">
+                        <button role="button" class="usa-button usa-button--unstyled citation-dialog-trigger display-flex"
+                            aria-label="Open dialog with citation text in different styles"
+                            data-ga-category="actions"
+                            data-ga-action="open"
+                            data-ga-label="cite_mobile"
+                            data-all-citations-url="/resources/citations/8885717/"
+                            data-citation-style="nlm"
+                            data-download-format-link="/resources/citations/8885717/export/"
+                        >
+                            <svg class="usa-icon width-4 height-4 usa-icon--bookmark-empty" aria-hidden="true" focusable="false" role="img" hidden>
+                                <use xlink:href="/static/img/sprite.svg#format_quote"></use>
+                            </svg>
+                        </button>
+                    </li>
+                    
+                    <li class="pmc-permalink display-flex">
+                         <button
+                                 type="button"
+                                 class="usa-button usa-button--unstyled display-flex"
+                                 aria-label="Show article permalink"
+                                 aria-expanded="false"
+                                 aria-haspopup="true"
+                                 data-ga-category="actions"
+                                 data-ga-action="open"
+                                 data-ga-label="permalink_mobile"
+                         >
+                            <svg class="usa-icon width-4 height-4" aria-hidden="true" focusable="false" role="img">
+                                <use xlink:href="/static/img/sprite.svg#share"></use>
+                            </svg>
+                        </button>
+                        
+
+<div class="pmc-permalink__dropdown" hidden>
+    <div class="pmc-permalink__dropdown__container">
+          <h2 class="usa-modal__heading margin-top-0 margin-bottom-2">PERMALINK</h2>
+          <div class="pmc-permalink__dropdown__content">
+              <input type="text" class="usa-input" value="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/" aria-label="Article permalink">
+              <button class="usa-button display-inline-flex pmc-permalink__dropdown__copy__btn margin-right-0" title="Copy article permalink" data-ga-category="save_share" data-ga-action="link" data-ga-label="copy_link">
+                  <svg class="usa-icon" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#content_copy"></use>
+                  </svg>
+                  <span class="margin-left-1">Copy</span>
+              </button>
+          </div>
+    </div>
+</div>
+                    </li>
+                </ul>
+            </div>
+            <button
+                    type="button"
+                    class="usa-button pmc-sidenav__container__open usa-button--unstyled width-auto display-flex"
+                    aria-label="Open article navigation"
+                    data-extra-class="is-visible-in-page"
+                    data-ga-category="actions"
+                    data-ga-action="open"
+                    data-ga-label="article_nav_mobile"
+            >
+                <svg class="usa-icon width-4 height-4" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#list"></use>
+                </svg>
+            </button>
+        </div>
+    </div>
+</div>
+    <div class="grid-container desktop:padding-left-6">
+      <div id="article-container" class="grid-row grid-gap">
+        <div class="grid-col-12 desktop:grid-col-8 order-2 pmc-layout__content">
+            <div class="grid-container padding-left-0 padding-right-0">
+                <div class="grid-row desktop:margin-left-neg-6">
+                    <div class="grid-col-12">
+                        <div class="pmc-layout__disclaimer" role="complementary" aria-label="Disclaimer note">
+    As a library, NLM provides access to scientific literature. Inclusion in an NLM database does not imply endorsement of, or agreement with,
+    the contents by NLM or the National Institutes of Health.<br/>
+    Learn more:
+    <a class="usa-link" data-ga-category="Link click" data-ga-action="Disclaimer" data-ga-label="New disclaimer box" href="/about/disclaimer/">PMC Disclaimer</a>
+    |
+    <a class="usa-link" data-ga-category="Link click" data-ga-action="PMC Copyright Notice" data-ga-label="New disclaimer box" href="/about/copyright/">
+        PMC Copyright Notice
+    </a>
+</div>
+                    </div>
+                </div>
+                <div class="grid-row pmc-wm desktop:margin-left-neg-6">
+                    <!-- Main content -->
+                    <main
+                      id="main-content"
+                      class="usa-layout-docs__main usa-layout-docs grid-col-12 pmc-layout pmc-prose padding-0"
+                    >
+
+                      
+                        <section class="pmc-journal-banner text-center line-height-none" aria-label="Journal banner"><img src="https://cdn.ncbi.nlm.nih.gov/pmc/banners/logo-fdh.png" alt="Frontiers in Digital Health logo" usemap="#pmc-banner-imagemap" width="500" height="75"><map name="pmc-banner-imagemap"><area alt="Link to Frontiers in Digital Health" title="Link to Frontiers in Digital Health" shape="default" href="https://www.frontiersin.org/journals/digital-health" target="_blank" rel="noopener noreferrer"></map></section><article lang="en"><section aria-label="Article citation and metadata"><section class="pmc-layout__citation font-secondary font-xs"><div>
+<div class="display-inline-block"><button type="button" class="cursor-pointer text-no-underline bg-transparent border-0 padding-0 text-left margin-0 text-normal text-primary" aria-controls="journal_context_menu">Front Digit Health</button></div>. 2022 Feb 15;4:788124. doi: <a href="https://doi.org/10.3389/fdgth.2022.788124" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">10.3389/fdgth.2022.788124</a>
+</div>
+<nav id="journal_context_menu" hidden="hidden"><ul class="menu-list font-family-ui" role="menu">
+<li role="presentation"><a href="https://www.ncbi.nlm.nih.gov/pmc/?term=%22Front%20Digit%20Health%22%5Bjour%5D" class="usa-link" role="menuitem">Search in PMC</a></li>
+<li role="presentation"><a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Front%20Digit%20Health%22%5Bjour%5D" lang="en" class="usa-link" role="menuitem">Search in PubMed</a></li>
+<li role="presentation"><a href="https://www.ncbi.nlm.nih.gov/nlmcatalog?term=%22Front%20Digit%20Health%22%5BTitle%20Abbreviation%5D" class="usa-link" role="menuitem">View in NLM Catalog</a></li>
+<li role="presentation"><a href="?term=%22Front%20Digit%20Health%22%5Bjour%5D" class="usa-link" role="menuitem" data-add-to-search="true">Add to search</a></li>
+</ul></nav></section><section class="front-matter"><div class="ameta p font-secondary font-xs">
+<hgroup><h1>Auto-CORPus: A Natural Language Processing Tool for Standardizing and Reusing Biomedical Literature</h1></hgroup><div class="cg p">
+<a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Beck%20T%22%5BAuthor%5D" class="usa-link" aria-describedby="id1"><span class="name western">Tim Beck</span></a><div hidden="hidden" id="id1">
+<h3><span class="name western">Tim Beck</span></h3>
+<div class="p">
+<sup>1</sup>Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom</div>
+<div class="p">
+<sup>2</sup>Health Data Research UK (HDR UK), London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Beck%20T%22%5BAuthor%5D" class="usa-link"><span class="name western">Tim Beck</span></a>
+</div>
+</div>
+<sup>1,</sup><sup>2,</sup><sup>*</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Shorter%20T%22%5BAuthor%5D" class="usa-link" aria-describedby="id2"><span class="name western">Tom Shorter</span></a><div hidden="hidden" id="id2">
+<h3><span class="name western">Tom Shorter</span></h3>
+<div class="p">
+<sup>1</sup>Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Shorter%20T%22%5BAuthor%5D" class="usa-link"><span class="name western">Tom Shorter</span></a>
+</div>
+</div>
+<sup>1,</sup><sup>†</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Hu%20Y%22%5BAuthor%5D" class="usa-link" aria-describedby="id3"><span class="name western">Yan Hu</span></a><div hidden="hidden" id="id3">
+<h3><span class="name western">Yan Hu</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">
+<sup>4</sup>Department of Surgery and Cancer, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Hu%20Y%22%5BAuthor%5D" class="usa-link"><span class="name western">Yan Hu</span></a>
+</div>
+</div>
+<sup>3,</sup><sup>4,</sup><sup>†</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Li%20Z%22%5BAuthor%5D" class="usa-link" aria-describedby="id4"><span class="name western">Zhuoyu Li</span></a><div hidden="hidden" id="id4">
+<h3><span class="name western">Zhuoyu Li</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Li%20Z%22%5BAuthor%5D" class="usa-link"><span class="name western">Zhuoyu Li</span></a>
+</div>
+</div>
+<sup>3,</sup><sup>†</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Sun%20S%22%5BAuthor%5D" class="usa-link" aria-describedby="id5"><span class="name western">Shujian Sun</span></a><div hidden="hidden" id="id5">
+<h3><span class="name western">Shujian Sun</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Sun%20S%22%5BAuthor%5D" class="usa-link"><span class="name western">Shujian Sun</span></a>
+</div>
+</div>
+<sup>3,</sup><sup>†</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Popovici%20CM%22%5BAuthor%5D" class="usa-link" aria-describedby="id6"><span class="name western">Casiana M Popovici</span></a><div hidden="hidden" id="id6">
+<h3><span class="name western">Casiana M Popovici</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">
+<sup>4</sup>Department of Surgery and Cancer, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Popovici%20CM%22%5BAuthor%5D" class="usa-link"><span class="name western">Casiana M Popovici</span></a>
+</div>
+</div>
+<sup>3,</sup><sup>4,</sup><sup>†</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22McQuibban%20NAR%22%5BAuthor%5D" class="usa-link" aria-describedby="id7"><span class="name western">Nicholas A R McQuibban</span></a><div hidden="hidden" id="id7">
+<h3><span class="name western">Nicholas A R McQuibban</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">
+<sup>5</sup>Centre for Integrative Systems Biology and Bioinformatics (CISBIO), Department of Life Sciences, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22McQuibban%20NAR%22%5BAuthor%5D" class="usa-link"><span class="name western">Nicholas A R McQuibban</span></a>
+</div>
+</div>
+<sup>3,</sup><sup>5</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Makraduli%20F%22%5BAuthor%5D" class="usa-link" aria-describedby="id8"><span class="name western">Filip Makraduli</span></a><div hidden="hidden" id="id8">
+<h3><span class="name western">Filip Makraduli</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Makraduli%20F%22%5BAuthor%5D" class="usa-link"><span class="name western">Filip Makraduli</span></a>
+</div>
+</div>
+<sup>3</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Yeung%20CS%22%5BAuthor%5D" class="usa-link" aria-describedby="id9"><span class="name western">Cheng S Yeung</span></a><div hidden="hidden" id="id9">
+<h3><span class="name western">Cheng S Yeung</span></h3>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Yeung%20CS%22%5BAuthor%5D" class="usa-link"><span class="name western">Cheng S Yeung</span></a>
+</div>
+</div>
+<sup>3</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Rowlands%20T%22%5BAuthor%5D" class="usa-link" aria-describedby="id10"><span class="name western">Thomas Rowlands</span></a><div hidden="hidden" id="id10">
+<h3><span class="name western">Thomas Rowlands</span></h3>
+<div class="p">
+<sup>1</sup>Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Rowlands%20T%22%5BAuthor%5D" class="usa-link"><span class="name western">Thomas Rowlands</span></a>
+</div>
+</div>
+<sup>1</sup>, <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Posma%20JM%22%5BAuthor%5D" class="usa-link" aria-describedby="id11"><span class="name western">Joram M Posma</span></a><div hidden="hidden" id="id11">
+<h3><span class="name western">Joram M Posma</span></h3>
+<div class="p">
+<sup>2</sup>Health Data Research UK (HDR UK), London, United Kingdom</div>
+<div class="p">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div class="p">Find articles by <a href="https://pubmed.ncbi.nlm.nih.gov/?term=%22Posma%20JM%22%5BAuthor%5D" class="usa-link"><span class="name western">Joram M Posma</span></a>
+</div>
+</div>
+<sup>2,</sup><sup>3,</sup><sup>*</sup>
+</div>
+<ul class="d-buttons inline-list">
+<li><button class="d-button" aria-controls="aip_a" aria-expanded="false">Author information</button></li>
+<li><button class="d-button" aria-controls="anp_a" aria-expanded="false">Article notes</button></li>
+<li><button class="d-button" aria-controls="clp_a" aria-expanded="false">Copyright and License information</button></li>
+</ul>
+<div class="d-panels font-secondary-light">
+<div id="aip_a" class="d-panel p" style="display: none">
+<div class="p" id="aff1">
+<sup>1</sup>Department of Genetics and Genome Biology, University of Leicester, Leicester, United Kingdom</div>
+<div id="aff2">
+<sup>2</sup>Health Data Research UK (HDR UK), London, United Kingdom</div>
+<div id="aff3">
+<sup>3</sup>Section of Bioinformatics, Division of Systems Medicine, Department of Metabolism, Digestion and Reproduction, Imperial College London, London, United Kingdom</div>
+<div id="aff4">
+<sup>4</sup>Department of Surgery and Cancer, Imperial College London, London, United Kingdom</div>
+<div id="aff5">
+<sup>5</sup>Centre for Integrative Systems Biology and Bioinformatics (CISBIO), Department of Life Sciences, Imperial College London, London, United Kingdom</div>
+<div class="author-notes p">
+<div class="fn" id="fn1"><p>Edited by: Patrick Ruch, Geneva School of Business Administration, Switzerland</p></div>
+<div class="fn" id="fn2"><p>Reviewed by: Denis Newman-Griffis, University of Pittsburgh, United States; Ceyda Kasavi, Marmara University, Turkey</p></div>
+<div class="fn" id="c001">
+<sup>✉</sup><p class="display-inline">*Correspondence: Tim Beck <span>tb143@leicester.ac.uk</span></p>
+</div>
+<div class="fn" id="c002">
+<sup>*</sup><p class="display-inline">Joram M. Posma <span>jmp111@ic.ac.uk</span></p>
+</div>
+<div class="fn" id="fn001"><p>This article was submitted to Health Informatics, a section of the journal Frontiers in Digital Health</p></div>
+<div class="fn" id="fn002"><p>†These authors have contributed equally to this work</p></div>
+</div>
+</div>
+<div id="anp_a" class="d-panel p" style="display: none"><div class="notes p"><section id="historyarticle-meta1" class="history"><p>Received 2021 Oct 1; Accepted 2022 Jan 21; Collection date 2022.</p></section></div></div>
+<div id="clp_a" class="d-panel p" style="display: none">
+<div>Copyright © 2022 Beck, Shorter, Hu, Li, Sun, Popovici, McQuibban, Makraduli, Yeung, Rowlands and Posma.</div>
+<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
+<div class="p"><a href="/about/copyright/" class="usa-link">PMC Copyright notice</a></div>
+</div>
+</div>
+<div>PMCID: PMC8885717  PMID: <a href="https://pubmed.ncbi.nlm.nih.gov/35243479/" class="usa-link">35243479</a>
+</div>
+</div></section></section><section aria-label="Article content"><section class="body main-article-body"><section class="abstract" id="abstract1"><h2>Abstract</h2>
+<p>To analyse large corpora using machine learning and other Natural Language Processing (NLP) algorithms, the corpora need to be standardized. The BioC format is a community-driven simple data structure for sharing text and annotations, however there is limited access to biomedical literature in BioC format and a lack of bioinformatics tools to convert online publication HTML formats to BioC. We present Auto-CORPus (Automated pipeline for Consistent Outputs from Research Publications), a novel NLP tool for the standardization and conversion of publication HTML and table image files to three convenient machine-interpretable outputs to support biomedical text analytics. Firstly, Auto-CORPus can be configured to convert HTML from various publication sources to BioC. To standardize the description of heterogenous publication sections, the Information Artifact Ontology is used to annotate each section within the BioC output. Secondly, Auto-CORPus transforms publication tables to a JSON format to store, exchange and annotate table data between text analytics systems. The BioC specification does not include a data structure for representing publication table data, so we present a JSON format for sharing table content and metadata. Inline tables within full-text HTML files and linked tables within separate HTML files are processed and converted to machine-interpretable table JSON format. Finally, Auto-CORPus extracts abbreviations declared within publication text and provides an abbreviations JSON output that relates an abbreviation with the full definition. This abbreviation collection supports text mining tasks such as named entity recognition by including abbreviations unique to individual publications that are not contained within standard bio-ontologies and dictionaries. 
The Auto-CORPus package is freely available with detailed instructions from GitHub at: <a href="https://github.com/omicsNLP/Auto-CORPus" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus</a>.</p>
+<section id="kwd-group1" class="kwd-group"><p><strong>Keywords:</strong> natural language processing, text mining, biomedical literature, semantics, health data</p></section></section><section id="s1"><h2 class="pmc_sec_title">Introduction</h2>
+<p>Natural language processing (NLP) is a branch of artificial intelligence that uses computers to process, understand, and use human language. NLP is applied in many different fields including language modeling, speech recognition, text mining, and translation systems. In the biomedical realm NLP has been applied to extract, for example, medication data from electronic health records and patient clinical history from free-text (unstructured) clinical notes, to significantly speed up processes that would otherwise be extracted manually by experts (<a href="#B1" class="usa-link" aria-describedby="B1">1</a>, <a href="#B2" class="usa-link" aria-describedby="B2">2</a>). Biomedical research publications, although semi-structured, pose similar challenges with regards to extracting and integrating relevant information (<a href="#B3" class="usa-link" aria-describedby="B3">3</a>). The full-text of biomedical literature is predominately made available online in the accessible and reusable HTML format, however, some publications are only available as PDF documents which are more difficult to reuse. Efforts to resolve the problem of publication text accessibility across science in general includes work by the Semantic Scholar search engine to convert PDF documents to HTML formats (<a href="#B4" class="usa-link" aria-describedby="B4">4</a>). Whichever process is used to obtain a suitable HTML file, before the text can be processed using NLP, heterogeneously structured HTML requires standardization and optimization. BioC is a simple JSON (and XML) format for sharing and reusing text data that has been developed by the text mining community to improve system interoperability (<a href="#B5" class="usa-link" aria-describedby="B5">5</a>). The BioC data model consists of collections of documents divided into data elements such as publication sections and associated entity and relation annotations. 
PubMed Central (PMC) makes full-text articles from its Open Access and Author Manuscript collections available in BioC format (<a href="#B6" class="usa-link" aria-describedby="B6">6</a>). To our knowledge there are no services available to convert PMC publications that are not part of these collections to BioC. Additionally, there is a gap in available software to convert publishers' publication HTML to BioC, creating a bottleneck in many biomedical literature text mining workflows caused by having to process documents in heterogenous formats. To bridge this gap, we have developed an Automated pipeline for Consistent Outputs from Research Publications (Auto-CORPus) that can be configured to process any HTML publication structure and transform the corresponding publications to BioC format.</p>
+<p>During information extraction, the publication section context of an entity will assist with entity prioritization. For example, an entity identified in the Results Section may be regarded as a higher priority novel finding than one identified in the Introduction Section. However, the naming and the sequential order of sections within research articles differ between publications. A Methods section, for example, may be found at different locations relative to other sections and identified using a range of synonyms such as <em>experimental section, experimental procedures</em>, and <em>methodology</em>. The Information Artifact Ontology (IAO) was created to serve as a domain-neutral resource for the representation of types of information content entities such as documents, databases, and digital images (<a href="#B7" class="usa-link" aria-describedby="B7">7</a>). Auto-CORPus applies IAO annotations to BioC file outputs to standardize the description of sections across all processed publications.</p>
+<p>Vast amounts of biomedical data are contained in publication tables which can be large and multi-dimensional where information beyond a standard two-dimensional matrix is conveyed to a human reader. For example, a table may have subsections or entirely new column headers to merge multiple tables into a single structure. Milosevic and colleagues developed a methodology to analyse complex tables that are represented in XML format and perform a semantic analysis to classify the data types used within a table (<a href="#B8" class="usa-link" aria-describedby="B8">8</a>). The outputs from the table analysis are stored in esoteric XML or database models. The communal BioC format on the other hand has limited support for tables, for example the PMC BioC JSON output includes table data in PMC XML format, introducing file parsing complexity. In addition to variations in how tables are structured, there is variability amongst table filetypes. Whereas publication full-text is contained within a single HTML file, tables may be contained within that full-text file (inline tables), or individual tables may be contained in separate HTML files (linked tables). We have defined a dedicated table JSON format for representing table data from both formats of table. The contents of individual cells are unambiguously identified and thus can be used in entity and relation annotations. In developing the Auto-CORPus table JSON format, we adopted a similar goal to the BioC community, namely, a simple format to maximize interoperability and reuse of table documents and annotations. The table JSON reuses the BioC data model for entity and relation annotations, ensuring that table and full-text annotations can share the same BioC syntax. Auto-CORPus transforms both inline and linked HTML tables to the machine interpretable table JSON format.</p>
+<p>Abbreviations and acronyms are widely used in publication text to reduce space and avoid prolixity. Abbreviations and their definitions are useful in text mining to identify lexical variations of words describing identical entities. However, the frequent use of novel abbreviations in texts presents a challenge for the curators of biomedical lexical ontologies to ensure they are continually updated. Several algorithms have been developed to extract abbreviations and their definitions from biomedical text (<a href="#B9" class="usa-link" aria-describedby="B9">9</a>–<a href="#B11" class="usa-link" aria-describedby="B11">11</a>). Abbreviations within publications can be defined when they are declared within the full-text, and in some publications, are included in a dedicated <em>abbreviations</em> section. Auto-CORPus adapts an abbreviation detecting methodology (<a href="#B12" class="usa-link" aria-describedby="B12">12</a>) and couples it with IAO section detection to comprehensively extract abbreviations declared in the full-text and in the <em>abbreviations</em> section. For each publication, Auto-CORPus generates an abbreviations dictionary JSON file.</p>
+<p>The aim of this article is to describe the open Auto-CORPus python package and the text mining use cases that make it a simple user-friendly application to create machine interpretable biomedical literature files, from a single publication to a large corpus. The authors share the common interest of progressing text mining capabilities across the biomedical literature domain and contribute omics and health data use cases related to their expertise in Genome-Wide Association Study (GWAS) and Metabolome-Wide Association Study (MWAS) data integration and analytics (see Author Contributions Section). The following sections describe the technical details about the algorithms developed and the benchmarking undertaken to assess the quality of the three Auto-CORPus outputs generated for each publication: BioC full-text, Auto-CORPus tables, and Auto-CORPus abbreviations JSON files.</p></section><section id="s2"><h2 class="pmc_sec_title">Materials and Methods</h2>
+<section id="sec3"><h3 class="pmc_sec_title">Data for Algorithm Development</h3>
+<p>We used a set of 3,279 full-text HTML and 1,041 linked table files to develop and test the algorithms described in this section. Files for 1,200 Open Access (OA) GWAS publications whose data exists in the GWAS Central database (<a href="#B13" class="usa-link" aria-describedby="B13">13</a>) were downloaded from PMC in March 2020. A further 1,241 OA PMC publications of MWAS and metabolomics studies on cancer, gastrointestinal diseases, metabolic syndrome, sepsis and neurodegenerative, psychiatric, and brain illnesses were also downloaded to ensure the methods are not biased toward one domain; more information is available in the <a href="#SM1" class="usa-link">Supplementary Material</a>. This formed a collection of 2,441 publications that will be referred to as the “OA dataset.” We also downloaded publisher-specific full-text files, and linked table data where available, for publications whose data exists in the GWAS Central database. This collection of 838 full-text and 1,041 table HTML files will be referred to as the “publisher dataset.” <a href="#T1" class="usa-link">Table 1</a> lists the publishers and journals included in the publisher dataset and the number of publications that overlap with the OA dataset. This also includes publications from non-biomedical fields to evaluate the application in other domains.</p>
+<section class="tw xbox font-sm" id="T1"><h4 class="obj_head">Table 1.</h4>
+<div class="caption p"><p>Publishers and journals included in the publisher dataset.</p></div>
+<div class="tbl-box p" tabindex="0"><table class="content" frame="hsides" rules="groups">
+<thead><tr>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Publisher</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Journal</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Number of full-text files</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Overlap with OA dataset</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Table type</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Number of table files</strong>
+</th>
+</tr></thead>
+<tbody>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">American Heart Association</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Circulation Cardiovascular Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">52</td>
+<td valign="top" align="left" rowspan="1" colspan="1">39</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">American Physical Society</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Physical Review Letters<a href="#TN1" class="usa-link"><sup>a</sup></a>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">6</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">American Psychological Association</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Psychological Bulletin<a href="#TN1" class="usa-link"><sup>a</sup></a>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">3</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">American Society of Hematology</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Blood</td>
+<td valign="top" align="left" rowspan="1" colspan="1">31</td>
+<td valign="top" align="left" rowspan="1" colspan="1">25</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">American Thoracic Society</td>
+<td valign="top" align="left" rowspan="1" colspan="1">American Journal of Respiratory and Critical Care Medicine</td>
+<td valign="top" align="left" rowspan="1" colspan="1">20</td>
+<td valign="top" align="left" rowspan="1" colspan="1">18</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">BioMed Central</td>
+<td valign="top" align="left" rowspan="1" colspan="1">BMC Medical Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">43</td>
+<td valign="top" align="left" rowspan="1" colspan="1">43</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">160</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Cell Press</td>
+<td valign="top" align="left" rowspan="1" colspan="1">American Journal of Human Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Elsevier</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Biological Psychiatry</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Gastroenterology</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">2</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Frontiers</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Frontiers in Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">20</td>
+<td valign="top" align="left" rowspan="1" colspan="1">20</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked images</td>
+<td valign="top" align="left" rowspan="1" colspan="1">n/a</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Frontiers in Physics<a href="#TN1" class="usa-link"><sup>a</sup></a>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">3</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Frontiers in Psychology<a href="#TN1" class="usa-link"><sup>a</sup></a>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">4</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Massachusetts Medical Society</td>
+<td valign="top" align="left" rowspan="1" colspan="1">The New England Journal of Medicine</td>
+<td valign="top" align="left" rowspan="1" colspan="1">20</td>
+<td valign="top" align="left" rowspan="1" colspan="1">12</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked images</td>
+<td valign="top" align="left" rowspan="1" colspan="1">n/a</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Mosby</td>
+<td valign="top" align="left" rowspan="1" colspan="1">The Journal of Allergy and Clinical Immunology</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">3</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Nature Portfolio</td>
+<td valign="top" align="left" rowspan="1" colspan="1">European Journal of Human Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">50</td>
+<td valign="top" align="left" rowspan="1" colspan="1">50</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">123</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Journal of Human Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">37</td>
+<td valign="top" align="left" rowspan="1" colspan="1">3</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">90</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Molecular Psychiatry</td>
+<td valign="top" align="left" rowspan="1" colspan="1">103</td>
+<td valign="top" align="left" rowspan="1" colspan="1">78</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">262</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Nature Physics<a href="#TN1" class="usa-link"><sup>a</sup></a>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">3</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Scientific Reports</td>
+<td valign="top" align="left" rowspan="1" colspan="1">80</td>
+<td valign="top" align="left" rowspan="1" colspan="1">80</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">190</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">The Pharmacogenomics Journal</td>
+<td valign="top" align="left" rowspan="1" colspan="1">37</td>
+<td valign="top" align="left" rowspan="1" colspan="1">16</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">116</td>
+</tr>
+<tr>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">Translational Psychiatry</td>
+<td valign="top" align="left" rowspan="1" colspan="1">41</td>
+<td valign="top" align="left" rowspan="1" colspan="1">41</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">87</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Oxford University Press</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Human Molecular Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">254</td>
+<td valign="top" align="left" rowspan="1" colspan="1">186</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">PLOS</td>
+<td valign="top" align="left" rowspan="1" colspan="1">PLOS One</td>
+<td valign="top" align="left" rowspan="1" colspan="1">20</td>
+<td valign="top" align="left" rowspan="1" colspan="1">20</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked images</td>
+<td valign="top" align="left" rowspan="1" colspan="1">n/a</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">SAGE Publications</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Psychological Science<a href="#TN1" class="usa-link"><sup>a</sup></a>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">3</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Springer</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Human Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">2</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Linked HTML</td>
+<td valign="top" align="left" rowspan="1" colspan="1">13</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Wiley-Blackwell</td>
+<td valign="top" align="left" rowspan="1" colspan="1">American Journal of Medical Genetics</td>
+<td valign="top" align="left" rowspan="1" colspan="1">5</td>
+<td valign="top" align="left" rowspan="1" colspan="1">0</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Inline</td>
+<td valign="top" align="left" rowspan="1" colspan="1">–</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Total</td>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">860</td>
+<td valign="top" align="left" rowspan="1" colspan="1">648</td>
+<td rowspan="1" colspan="1"></td>
+<td valign="top" align="left" rowspan="1" colspan="1">1,041</td>
+</tr>
+</tbody>
+</table></div>
+<div class="p text-right font-secondary"><a href="table/T1/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<div class="tw-foot p">
+<div class="fn" id="_fn_p13"><p><em>The full-text files were downloaded in HTML format and the linked table files were downloaded when available in HTML formats. The full-text files that overlap with the OA dataset were used to assess the consistency of outputs generated from different sources</em>.</p></div>
+<div class="fn" id="TN1">
+<sup>a</sup><p class="display-inline"><em>These publications are not part of the publisher dataset for evaluating tables, but are used for evaluating the accuracy of IAO header mapping</em>.</p>
+</div>
+</div></section></section><section id="sec4"><h3 class="pmc_sec_title">Algorithms for Processing Publication Full-Text HTML</h3>
+<p>An Auto-CORPus configuration file is set by the user to define the heading and paragraph HTML elements used in the publication files to be processed. Regular expressions can be used within the configuration file allowing a group of publications with a similar but not an identical structure to be defined by a single configuration file, for example when processing publications from journals by the same publisher. The heading elements are used to delineate the content of the publication sections and the BioC data structure is populated with publication text. All HTML tags including text formatting (e.g., emphasized words, superscript, and subscript) are removed from the publication text. Each section is automatically annotated using IAO (see Section Algorithms for Classifying Publication Sections With IAO Terms) and the BioC data structure is output in JSON format. The BioC specification requires “key files” to accompany BioC data files to specify how the data files should be interpreted (<a href="#B5" class="usa-link" aria-describedby="B5">5</a>). We provide key files to define the data elements in the Auto-CORPus JSON output files for full-text, tables, and abbreviations (<a href="https://github.com/omicsNLP/Auto-CORPus/tree/main/keyFiles" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus/tree/main/keyFiles</a>). <a href="#F1" class="usa-link">Figure 1</a> gives an example of the BioC JSON output and the abbreviations and tables outputs are described below.</p>
+<figure class="fig xbox font-sm" id="F1"><h4 class="obj_head">Figure 1.</h4>
+<p class="img-box line-height-none margin-x-neg-2 tablet:margin-x-0 text-center"><a class="tileshop" target="_blank" href="https://www.ncbi.nlm.nih.gov/core/lw/2.0/html/tileshop_pmc/tileshop_pmc_inline.html?title=Click%20on%20image%20to%20zoom&amp;p=PMC3&amp;id=8885717_fdgth-04-788124-g0001.jpg"><img class="graphic zoom-in" src="https://cdn.ncbi.nlm.nih.gov/pmc/blobs/a589/8885717/65f397c7e326/fdgth-04-788124-g0001.jpg" loading="lazy" height="493" width="708" alt="Figure 1"></a></p>
+<div class="p text-right font-secondary"><a href="figure/F1/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<figcaption><p>An extract of the Auto-CORPus BioC JSON created from the PMC3606015 full-text HTML file. Each section is annotated with IAO terms. The “autocorpus_fulltext.key” file describes the contents of the full-text JSON file (<a href="https://github.com/omicsNLP/Auto-CORPus/blob/main/keyFiles/autocorpus_fulltext.key" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus/blob/main/keyFiles/autocorpus_fulltext.key</a>).</p></figcaption></figure><p>Abbreviations in the full-text are found using an adaptation of a previously published methodology and implementation (<a href="#B12" class="usa-link" aria-describedby="B12">12</a>). The method finds all brackets within a publication and if there are two or more non-digit characters within brackets it considers if the string in the brackets could be an abbreviation. It searches for the characters present in the brackets in the text on either side of the brackets one by one. The first character of one of these words must contain the first character within the bracket, and the other characters within that bracket must be contained by other words that follow the first word whose first character is the same as the first character in that bracket. An example of the Auto-CORPus abbreviations JSON is given in <a href="#F2" class="usa-link">Figure 2</a> which shows that the output from this algorithm is stored along with the abbreviations defined in the publication abbreviations section (if present).</p>
+<figure class="fig xbox font-sm" id="F2"><h4 class="obj_head">Figure 2.</h4>
+<p class="img-box line-height-none margin-x-neg-2 tablet:margin-x-0 text-center"><a class="tileshop" target="_blank" href="https://www.ncbi.nlm.nih.gov/core/lw/2.0/html/tileshop_pmc/tileshop_pmc_inline.html?title=Click%20on%20image%20to%20zoom&amp;p=PMC3&amp;id=8885717_fdgth-04-788124-g0002.jpg"><img class="graphic zoom-in" src="https://cdn.ncbi.nlm.nih.gov/pmc/blobs/a589/8885717/7ef12cdee15d/fdgth-04-788124-g0002.jpg" loading="lazy" height="913" width="708" alt="Figure 2"></a></p>
+<div class="p text-right font-secondary"><a href="figure/F2/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<figcaption><p>An extract from the Auto-CORPus abbreviations JSON created from the PMC4068805 full-text HTML file. For each abbreviation the corresponding long form definition is given along with the algorithm(s) used to detect the abbreviation. Most of the abbreviations shown were independently identified in both the full-text and in the abbreviations section of the publication. A variation in the definition of “RP” was detected: in the abbreviations section this was defined as “reverse phase,” however in the full-text this was defined as “reversed phase.” The “autocorpus_abbreviations.key” file describes the contents of the abbreviations JSON file (<a href="https://github.com/omicsNLP/Auto-CORPus/blob/main/keyFiles/autocorpus_abbreviations.key" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus/blob/main/keyFiles/autocorpus_abbreviations.key</a>).</p></figcaption></figure></section><section id="sec5"><h3 class="pmc_sec_title">Algorithms for Classifying Publication Sections With IAO Terms</h3>
+<p>A total of 21,849 section headers were extracted from the OA dataset and directed path graphs (DPGs) were created for each publication (<a href="#F3" class="usa-link">Figure 3</a>). DPGs are a linear chain without any cycles. For example, at this point in this article the main headers are <em>abstract</em> (one paragraph) followed by <em>introduction</em> (five paragraphs) and <em>materials and methods</em> (four paragraphs, three sub-headers)—this would make up a DPG with three nodes (<em>abstract, introduction, materials and methods</em>) and two directed edges. For our <em>Introduction Section</em>, while the individual five paragraphs within a section would all be mapped to the main header (<em>introduction</em>), only one node would appear in the DPG (relating to the header itself) without any self-edges. The individual DPGs were then combined into a directed graph (digraph, <a href="#SM1" class="usa-link">Supplementary Figure 2</a>) and the extracted section headers were mapped to IAO (v2020-06-10) <em>document part</em> terms using the Lexical OWL Ontology Matcher (LOOM) method (<a href="#B14" class="usa-link" aria-describedby="B14">14</a>). Fuzzy matching using the fuzzywuzzy python package (v0.17.0) was then used to map headers to the preferred section header terms and synonyms, with a similarity threshold of 0.8 (e.g., the typographical error “experemintal section” in PMC4286171 is correctly mapped to <em>methods section</em>). This threshold was evaluated by two independent researchers who confirmed all matches for the OA dataset were accurate. Digraphs consist of nodes (entities, headers) and edges (links between nodes) and the weight of the nodes and edges is proportional to the number of publications in which these are found. Here the digraph consists of 372 unique nodes and 806 directed edges (<a href="#SM1" class="usa-link">Supplementary Figure 1</a>).</p>
+<figure class="fig xbox font-sm" id="F3"><h4 class="obj_head">Figure 3.</h4>
+<p class="img-box line-height-none margin-x-neg-2 tablet:margin-x-0 text-center"><a class="tileshop" target="_blank" href="https://www.ncbi.nlm.nih.gov/core/lw/2.0/html/tileshop_pmc/tileshop_pmc_inline.html?title=Click%20on%20image%20to%20zoom&amp;p=PMC3&amp;id=8885717_fdgth-04-788124-g0003.jpg"><img class="graphic zoom-in" src="https://cdn.ncbi.nlm.nih.gov/pmc/blobs/a589/8885717/0bb081de02ac/fdgth-04-788124-g0003.jpg" loading="lazy" height="179" width="708" alt="Figure 3"></a></p>
+<div class="p text-right font-secondary"><a href="figure/F3/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<figcaption><p>Flow diagram demonstrating the process of classifying publication sections with IAO terms. The unfiltered digraph is visualized in <a href="#SM1" class="usa-link">Supplementary Figure 1</a>, and the process of combining DPGs and mapping unmapped nodes using anchor points in <a href="#SM1" class="usa-link">Supplementary Figure 2</a>. DPG, directed path graph; G(V,E), graph(vertex, edge); IAO, information artifact ontology.</p></figcaption></figure><p>However, after direct IAO mapping and fuzzy matching, unmapped headers still existed. To map these headings, we developed a new method using both the digraph and the individual DPGs. The headers are not repeated within a document/DPG, they are sequential/a chain and have a set order that can be exploited. Unmapped headers are assigned a section based on the digraph and the headers in the publication (DPG) that could be mapped (anchor headers), an example is given in <a href="#SM1" class="usa-link">Supplementary Figure 2</a> where a header cannot be mapped to IAO terms. Any unmapped header that is mapped to an existing IAO term in this manner does not result in a self-edge in the network as subsequent repeated headers are collapsed into a single node. Auto-CORPus uses the LOOM, fuzzy matching and digraph prediction algorithms to annotate publication sections with IAO terms in the BioC full-text file. Paragraphs can be mapped to multiple IAO terms in case of publications without main-text headers (based on digraph prediction) or with ambiguous headers (based on fuzzy matching and/or digraph prediction).</p>
+<section id="sec6"><h4 class="pmc_sec_title">New IAO Terms and Synonyms</h4>
+<p>We used the IAO classification algorithms to identify potential new IAO terms and synonyms. Three hundred and forty-eight headings from the OA dataset were mapped to IAO terms during the fuzzy matching or mapped based on the digraph using the publication structure and anchor headers. These headings were considered for inclusion in IAO as term synonyms. We manually evaluated each heading and <a href="#T2" class="usa-link">Table 2</a> lists the 94 synonyms we identified for existing IAO terms.</p>
+<section class="tw xbox font-sm" id="T2"><h5 class="obj_head">Table 2.</h5>
+<div class="caption p"><p>New synonyms identified for existing IAO terms from the fuzzy and digraph mappings of 2,441 publications.</p></div>
+<div class="tbl-box p" tabindex="0"><table class="content" frame="hsides" rules="groups">
+<thead><tr>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Category (IAO identifier)</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Existing synonyms<a href="#TN2" class="usa-link"><sup><strong>a</strong></sup></a></strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>New synonyms identified<a href="#TN3" class="usa-link"><sup><strong>b</strong></sup></a></strong>
+</th>
+</tr></thead>
+<tbody>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">abbreviations (IAO:0000606)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">abbreviations, abbreviations list, abbreviations used, list of abbreviations, list of abbreviations used</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>abbreviation and acronyms, abbreviation list, abbreviations and acronyms</em>, abbreviations used in this paper, <em>definitions for abbreviations</em>, glossary, key abbreviations, <em>non-standard abbreviations, nonstandard abbreviations, nonstandard abbreviations and acronyms</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">abstract (IAO:0000315)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">abstract</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>precis</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">acknowledgments (IAO:0000324)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">acknowledgments, acknowledgments</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>acknowledgment, acknowledgment</em>, acknowledgments and disclaimer</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">author contributions (IAO:0000323)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">author contributions, contributions by the authors</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>authors' contribution, authors' contributions, authors' roles, contributorship</em>, main authors by consortium and author contributions</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">author information (IAO:0000607)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">author information, authors' information</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>biographies, contributor information</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">availability (IAO:0000611)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">availability, availability and requirements</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>availability of data</em>, availability of data and materials, <em>data archiving, data availability, data availability statement, data sharing statement</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">conclusion (IAO:0000615)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">concluding remarks, conclusion, conclusions, findings, summary</td>
+<td valign="top" align="left" rowspan="1" colspan="1">conclusion and perspectives, summary and conclusion</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">conflict of interest (IAO:0000616)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">competing interests, conflict of interest, conflict of interest statement, declaration of competing interests, disclosure of potential conflicts of interest</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>authors' disclosures of potential conflicts of interest, competing financial interests, conflict of interests, conflicts of interest, declaration of competing interest, declaration of interest, declaration of interests, disclosure of conflict of interest, duality of interest, statement of interest</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">consent (IAO:0000618)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">consent</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Informed consent</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">discussion (IAO:0000319)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">discussion, discussion section</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>discussions</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">ethical approval (IAO:0000620)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">ethical approval</td>
+<td valign="top" align="left" rowspan="1" colspan="1">ethics approval and consent to participate, <em>ethical requirements, ethics, ethics statement</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">footnote (IAO:0000325)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">endnote, footnote</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>footnotes</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">funding source declaration (IAO:0000623)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">funding, funding information, funding sources, funding statement, funding/support, source of funding, sources of funding</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>financial support, grants, role of the funding source, study funding</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">future directions (IAO:0000625)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">future challenges, future considerations, future developments, future directions, future outlook, future perspectives, future plans, future prospects, future research, future research directions, future studies, future work</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>outlook</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">introduction (IAO:0000316)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">background, introduction</td>
+<td valign="top" align="left" rowspan="1" colspan="1">introductory paragraph</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">materials (IAO:0000633)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">materials</td>
+<td valign="top" align="left" rowspan="1" colspan="1">data, data description</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">methods (IAO:0000317)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">experimental, experimental procedures, experimental section, materials and methods, methods</td>
+<td valign="top" align="left" rowspan="1" colspan="1">analytical methods, concise methods, <em>experimental methods, method</em>, method validation, <em>methodology</em>, methods and design, methods and procedures, methods and tools, methods/design, online methods, star methods, study design, study design and methods</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">references (IAO:0000320)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">bibliography, literature cited, references</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>literature cited, reference, references, reference list</em>, selected references, web site references</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">statistical analysis (IAO:0000644)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">statistical analysis</td>
+<td valign="top" align="left" rowspan="1" colspan="1">statistical methods, statistical methods and analysis, statistics</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">study limitations (IAO:0000631)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">limitations, study limitations</td>
+<td valign="top" align="left" rowspan="1" colspan="1">strengths and limitations, study strengths and limitations</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">supplementary material (IAO:0000326)</td>
+<td valign="top" align="left" rowspan="1" colspan="1">additional information, appendix, supplemental information, supplementary material, supporting information</td>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>additional file, additional files</em>, additional information and declarations, additional points, <em>electronic supplementary material, electronic supplementary materials</em>, online content, <em>supplemental data, supplemental material, supplementary data</em>, supplementary figures and tables, <em>supplementary files, supplementary information, supplementary materials</em>, supplementary materials figures, supplementary materials figures and tables, supplementary materials table, supplementary materials tables</td>
+</tr>
+</tbody>
+</table></div>
+<div class="p text-right font-secondary"><a href="table/T2/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<div class="tw-foot p">
+<div class="fn" id="TN2">
+<sup>a</sup><p class="display-inline"><em>IAO v2020-06-10</em>.</p>
+</div>
+<div class="fn" id="TN3">
+<sup>b</sup><p class="display-inline"><em>Elements in italics have previously been submitted by us for inclusion into IAO and added in the v2020-12-09 IAO release</em>.</p>
+</div>
+</div></section><p>Digraph nodes that were not mapped to IAO terms but formed heavily weighted “ego-networks,” indicating the same heading was found in many publications, were manually evaluated for inclusion in IAO as new terms. For example, based on the digraph, we assigned <em>data</em> and <em>data description</em> to be synonyms of the <em>materials section</em>. The same process was applied to ego-networks from other nodes linked to existing IAO terms to add additional synonyms to simplify the digraph. <a href="#F4" class="usa-link">Figure 4</a> shows the ego-network for <em>abstract</em>, and four main categories and one potential new synonym (<em>precis</em>, in red) were identified. From the further analysis of all ego-networks, four new potential terms were identified: <em>disclosure, graphical abstract, highlights</em>, and <em>participants</em>—the latter is related to, but deemed distinct from, the existing <em>patients section</em> (IAO:0000635). <a href="#T3" class="usa-link">Table 3</a> details the proposed definition and synonyms for these terms. The terms and synonyms described here will be submitted to the IAO, with our initial submission of one term and 59 synonyms accepted and included in IAO previously (v2020-12-09) (<a href="https://github.com/information-artifact-ontology/IAO/issues/234" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/information-artifact-ontology/IAO/issues/234</a>). <a href="#F5" class="usa-link">Figure 5</a> shows the resulting digraph with only existing and newly proposed section terms. A major unmapped node is <em>associated data</em>, which is a header specific for PMC articles that appears at the beginning of each article before the abstract. 
In addition, IAO has separate definitions for <em>materials</em> (IAO:0000633), <em>methods</em> (IAO:0000317), and <em>statistical methods</em> (IAO:0000644) sections, hence they are separate nodes in the graph. The <em>introduction</em> is often followed by these headers to reflect the <em>methods section</em> (and synonyms), however there is also a major directed edge from <em>introduction</em> directly to <em>results</em> to account for <em>materials and methods</em> placed after the <em>discussion</em> and/or <em>conclusion</em> sections in some publications.</p>
+<figure class="fig xbox font-sm" id="F4"><h5 class="obj_head">Figure 4.</h5>
+<p class="img-box line-height-none margin-x-neg-2 tablet:margin-x-0 text-center"><a class="tileshop" target="_blank" href="https://www.ncbi.nlm.nih.gov/core/lw/2.0/html/tileshop_pmc/tileshop_pmc_inline.html?title=Click%20on%20image%20to%20zoom&amp;p=PMC3&amp;id=8885717_fdgth-04-788124-g0004.jpg"><img class="graphic zoom-in" src="https://cdn.ncbi.nlm.nih.gov/pmc/blobs/a589/8885717/4792acdb47bf/fdgth-04-788124-g0004.jpg" loading="lazy" height="357" width="708" alt="Figure 4"></a></p>
+<div class="p text-right font-secondary"><a href="figure/F4/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<figcaption><p>Unmapped nodes in the digraph (<a href="#F3" class="usa-link">Figure 3</a>) connected to “abstract” as ego node, excluding corpus specific nodes, grouped into different categories. Unlabeled nodes are titles of paragraphs in the main text.</p></figcaption></figure><section class="tw xbox font-sm" id="T3"><h5 class="obj_head">Table 3.</h5>
+<div class="caption p"><p><strong>(A)</strong> Proposed new IAO terms to define publication sections that were derived from analyzing the sections of 2,441 publications. <strong>(B)</strong> Proposed new IAO terms to define parts of a table section. Elements in italics have previously been submitted by us for inclusion into IAO and added in the v2020-12-09 IAO release.</p></div>
+<div class="tbl-box p" tabindex="0"><table class="content" frame="hsides" rules="groups">
+<thead><tr>
+<th rowspan="1" colspan="1"></th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Proposed definition</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Proposed synonyms</strong>
+</th>
+</tr></thead>
+<tbody>
+<tr><td valign="top" align="left" colspan="3" rowspan="1">
+<strong>(A) Proposed category</strong>
+</td></tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Disclosure</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“A part of a document used to disclose any associations by authors that might be perceived as to potentially interfere with or prevent them from reporting research with complete objectivity.”</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Author disclosure statement, declarations, disclosure, disclosure statement, disclosures</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">
+<em>Graphical abstract</em>
+</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“<em>An abstract that is a pictorial summary of the main findings described in a document</em>.”</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Central illustration, <em>graphical abstract</em>, TOC image, <em>visual abstract</em>
+</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Highlights</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“A short collection of key messages that describe the core findings and essence of the article in concise form. It is distinct and separate from the abstract and only conveys the results and concept of a study. It is devoid of jargon, acronyms and abbreviations and targeted at a broader, non-technical audience.”</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Author summary, editors' summary, highlights, key points, overview, research in context, significance, TOC</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Participants</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“A section describing the recruitment of subjects into a research study. This section is distinct from the ‘patients’ section and mostly focusses on healthy volunteers.”</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Participants, sample</td>
+</tr>
+<tr><td valign="top" align="left" colspan="3" rowspan="1">
+<strong>(B) Proposed category</strong>
+</td></tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Table title</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“A textual entity that names a table.”</td>
+<td rowspan="1" colspan="1"></td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Table caption</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“A textual entity that describes a table.”</td>
+<td rowspan="1" colspan="1"></td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Table footer</td>
+<td valign="top" align="left" rowspan="1" colspan="1">“A part of a table that provides additional information about a specific other part of the table. Footers are spatially segregated from the rest of the table and are usually indicated by a superscripted number or letter, or a special typographic character such as †.”</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Table key, table note, table notes</td>
+</tr>
+</tbody>
+</table></div>
+<div class="p text-right font-secondary"><a href="table/T3/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div></section><figure class="fig xbox font-sm" id="F5"><h5 class="obj_head">Figure 5.</h5>
+<p class="img-box line-height-none margin-x-neg-2 tablet:margin-x-0 text-center"><a class="tileshop" target="_blank" href="https://www.ncbi.nlm.nih.gov/core/lw/2.0/html/tileshop_pmc/tileshop_pmc_inline.html?title=Click%20on%20image%20to%20zoom&amp;p=PMC3&amp;id=8885717_fdgth-04-788124-g0005.jpg"><img class="graphic zoom-in" src="https://cdn.ncbi.nlm.nih.gov/pmc/blobs/a589/8885717/25a36ed26300/fdgth-04-788124-g0005.jpg" loading="lazy" height="766" width="708" alt="Figure 5"></a></p>
+<div class="p text-right font-secondary"><a href="figure/F5/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<figcaption><p>Final digraph model used in Auto-CORPus to classify paragraphs after fuzzy matching to IAO terms (v2020-06-10). This model includes new (proposed) section terms and each section contains new synonyms identified in this analysis. “Associated Data” is included as this is a PMC-specific header found before abstracts and can be used to indicate the start of most articles, all IAO terms are indicated in orange.</p></figcaption></figure></section></section><section id="sec7"><h3 class="pmc_sec_title">Algorithms for Processing Tables</h3>
+<section id="sec8"><h4 class="pmc_sec_title">Auto-CORPus Table JSON Design</h4>
+<p>The BioC format does not specify how table content should be structured, leaving this open to the interpretation of implementers. For example, the PMC BioC JSON output describes table content using PMC XML (see the “pmc.key” file at <a href="https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/pmc.key" class="usa-link" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/pmc.key</a>). Including markup language within JSON objects presents data parsing challenges and interoperability barriers with non-PMC table data representations. We developed a simple table JSON format that is agnostic to the publication table source, can store multi-dimensional table content from complex table structures, and applies BioC design principles (<a href="#B5" class="usa-link" aria-describedby="B5">5</a>) to enable the annotation of entities and relations between entities. The table JSON stores table metadata of title, caption and footer. The table content is stored as “column headers” and “data rows.” The format supports the use of IAO to define the table metadata and content sections, however additional IAO terms are required to define table metadata document parts. <a href="#T3" class="usa-link">Table 3</a> includes the proposed definition and synonyms for these terms. To compensate for currently absent IAO terms, we have defined three section type labels: <em>table title, table caption</em> and <em>table footer</em>. To support the text mining of tables, each column header and data row cell has an identifier that can be used to identify entities in annotations. Tables can be arranged into subsections, thus the table JSON represents this and includes subsection headings. <a href="#F6" class="usa-link">Figure 6</a> gives an example of table metadata and content stored in the Auto-CORPus table JSON format. 
In addition to the Auto-CORPus key files, we make a table JSON schema available for the validation of table JSON files and to facilitate the use of the format in text analytics software and pipelines.</p>
+<figure class="fig xbox font-sm" id="F6"><h5 class="obj_head">Figure 6.</h5>
+<p class="img-box line-height-none margin-x-neg-2 tablet:margin-x-0 text-center"><a class="tileshop" target="_blank" href="https://www.ncbi.nlm.nih.gov/core/lw/2.0/html/tileshop_pmc/tileshop_pmc_inline.html?title=Click%20on%20image%20to%20zoom&amp;p=PMC3&amp;id=8885717_fdgth-04-788124-g0006.jpg"><img class="graphic zoom-in" src="https://cdn.ncbi.nlm.nih.gov/pmc/blobs/a589/8885717/4177c1bad6e9/fdgth-04-788124-g0006.jpg" loading="lazy" height="561" width="708" alt="Figure 6"></a></p>
+<div class="p text-right font-secondary"><a href="figure/F6/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div>
+<figcaption><p>Extracts of the Auto-CORPus table JSON file generated to store metadata and content for an example table. <strong>(A)</strong> The parts of a table stored in table JSON. The section titles are underlined. The table shown is the PMC version (PMC4245044) of Table 1 from (<a href="#B15" class="usa-link" aria-describedby="B15">15</a>). <strong>(B)</strong> The title and caption table metadata stored in table JSON. <strong>(C)</strong> Each column heading in the table content is split between two rows, so the strings from both cells are concatenated with a pipe symbol in the table JSON. Headers that span multiple columns of sub-headers are replicated in each header cell as here with the pipe symbol. <strong>(D)</strong> The table content for the first row from the first section is shown in table JSON. Superscript characters are identified using HTML markup. <strong>(E)</strong> The footer table metadata stored in table JSON. The “autocorpus_tables.key” file describes the contents of the tables JSON file (<a href="https://github.com/omicsNLP/Auto-CORPus/blob/main/keyFiles/autocorpus_tables.key" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus/blob/main/keyFiles/autocorpus_tables.key</a>).</p></figcaption></figure></section><section id="sec9"><h4 class="pmc_sec_title">Processing Table HTML</h4>
+<p>Tables can be used within HTML documents for formatting web page layouts and are distinct from the <em>data tables</em> processed by Auto-CORPus. The configuration file set by the user identifies the HTML elements used to define data table containers, which include title, caption, footer, and table content. The files processed can either be a full-text HTML file for inline tables and/or separate HTML files for individual linked tables. The Auto-CORPus algorithm for processing tables is based on the functional and structural table analysis method described by Milosevic et al. (<a href="#B8" class="usa-link" aria-describedby="B8">8</a>). The cells that contain navigational information such as column headers and section headings are identified. If a column has header strings contained in cells spanning multiple rows, the strings are concatenated with a pipe character separator to form a single column header string. The “super row” is a single text string that spans a complete row (multiple columns) within the table body. The “index column” is a single text string in the first column (sometimes known as a stub) within the table body when either only the first column does not have a header, or the cell spans more than one row. The presence of a super row or index column indicates a table section division where the previous section (if present) ends, and a new section starts. The super row or index column text string provides the section name. A nested array data structure of table content is built to relate column headers to data rows, working from top to bottom and left to right, with section headings occurring in between and grouping data rows. The algorithm extracts the table metadata of title, footer and caption. Table content and metadata are output in the table JSON format. 
The contents of table cells can be either string or number data types (we consider “true” and “false” booleans as strings) and are represented in the output file using the respective JSON data type. Cells that contain only scientific notation are converted to exponential notation and stored as a JSON number data type. All HTML text formatting is removed, however this distorts the meaning of positive exponents in text strings, for example <em>n</em> = <em>10</em><sup>3</sup> is represented as <em>n</em> = <em>103</em>. To preserve the meaning of exponents within text strings, superscript characters are identified using superscript HTML element markup, for example <em>n = 10 &lt;sup&gt;3&lt;/sup&gt;</em>.</p>
+<p>Some publication tables contain content that could be represented in two or more separate tables. These multi-dimensional tables use the same gridlines, but new column headers are declared after initial column headers and data rows have appeared in the table. New column headers are identified by looking down columns and classifying each cell as one of three types: numerical, textual, and a mix of numbers and text. The type for a column is determined by the dominant cell type of all rows in a column excluding super rows. After the types of all columns are determined, the algorithm loops through all rows except super rows, and if more than half of cells in the row do not match with the columns' types, the row is identified as a new header row, and the rows that follow the new headers are then regarded as a sub-table. Auto-CORPus represents sub-tables as distinct tables in the table JSON, with identical metadata to the initial table. Tables are identified by the table number used in the publication, so since sub-tables will share their table number with the initial table, a new identifier is created for sub-tables with the initial table number, an underscore, then a sub-table number such as “1_1.”</p></section></section><section id="sec10"><h3 class="pmc_sec_title">Comparative Analysis of Outputs</h3>
+<p>The PMC BioC and Auto-CORPus BioC outputs were compared to evaluate whether all information present in the PMC BioC output also appears in the Auto-CORPus BioC output. This was done by analyzing the number of characters in the PMC BioC JSON that appear in the same order in the Auto-CORPus BioC JSON using the longest common subsequence method. With this method, overlapping sequences of characters that vary in length are extracted from the PMC BioC string to find a matching sequence in the Auto-CORPus string. With this method it can occur that a subsequence from the PMC BioC matches to multiple parts of the Auto-CORPus BioC string (e.g., repeated words). This is mitigated by evaluating matches of overlapping/adjacent subsequences which should all be close to each other as they appear in the PMC BioC text.</p>
+<p>This longest common subsequence method was applied to each individual paragraph of the PMC BioC input and compared with the Auto-CORPus BioC paragraphs. This method was chosen over other string metric algorithms, such as the Levenshtein distance or cosine-similarity, due to it being non-symmetric/unidirectional (the Auto-CORPus BioC output strings contain more information (e.g., figure/table links, references) than the PMC BioC output) and ability to directly extract different characters.</p></section></section><section id="s3"><h2 class="pmc_sec_title">Results</h2>
+<section id="sec12"><h3 class="pmc_sec_title">Data for the Evaluation of Algorithms</h3>
+<p>We attempted to download PMC BioC JSON format for all 1,200 GWAS PMC publications in our OA dataset, but only 766 were available as BioC from the NCBI server. We refer to this as the “PMC BioC dataset.” For the 766 PMC articles where we could obtain a NCBI BioC file, we processed the equivalent PMC HTML files using Auto-CORPus. We used only the BioC output files and refer to this as the “Auto-CORPus BioC dataset.” To compare the Auto-CORPus BioC and table outputs for PMC and publisher-specific versions, we accessed 163 Nature Communication and 5 Nature Genetics articles that overlap with the OA dataset and were not present in the publisher dataset, so they were unseen data. These journals have linked tables, so full-text and all linked table HTML files were accessed (367 linked table files). Auto-CORPus configuration files were setup for the journals to process the publisher-specific files and the BioC and table JSON output files were collated into what we refer to as the “linked table dataset.” The equivalent PMC HTML files from the OA dataset were also processed by Auto-CORPus and the BioC and table JSON files form the “inline table dataset.”</p></section><section id="sec13"><h3 class="pmc_sec_title">Performance of Auto-CORPus Full-Text Processing</h3>
+<p>The proportion of characters from 3,195 full-text paragraphs in the PMC BioC dataset that also appear in the Auto-CORPus BioC dataset in the same order in the paragraph string were evaluated using the longest common subsequence method. The median and interquartile range of the (left-skewed) similarity are 100% and 100–100%, respectively. Differences between the Auto-CORPus and PMC outputs are shown in <a href="#T4" class="usa-link">Table 4</a> and relate to how display items, abbreviations and links are stored, and different character encodings. A structural difference between the two outputs is in how section titles are associated to passage text. In PMC BioC the section titles (and subtitles) are distinct from the passages they describe as both are treated as equivalent text. The section title occurs once in the file and the passage(s) it refers to follows it. In Auto-CORPus BioC the (first level) section titles (and subtitles) are linked directly with the passage text they refer to, and are included for each paragraph. Auto-CORPus uses IAO to classify text sections so, for example, the introduction title and text are grouped into a section annotated as introduction, rather than splitting these into two subsections (introduction title and introduction text as separate entities in the PMC BioC output) which would not fit with the IAO structure.</p>
+<section class="tw xbox font-sm" id="T4"><h4 class="obj_head">Table 4.</h4>
+<div class="caption p"><p>Differences between the Auto-CORPus BioC and PMC BioC JSON outputs.</p></div>
+<div class="tbl-box p" tabindex="0"><table class="content" frame="hsides" rules="groups">
+<thead><tr>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Difference</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>Auto-CORPus</strong>
+</th>
+<th valign="top" align="left" rowspan="1" colspan="1">
+<strong>PMC</strong>
+</th>
+</tr></thead>
+<tbody>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Section titles</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Section titles, subtitles, subsubtitles (and so on) are linked to the passage text they apply to</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Section titles, subtitles, subsubtitles (and so on) precede the passage text they apply to</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Section types</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Section types are annotated using IAO terms</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Section types are described using custom labels</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Offset counts</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Offset increased by 1 for every character (including whitespace) in a passage</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Offset increased by the number of bytes in the text of a passage plus one space</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Table and figure sections</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Structured table data are stored in table JSON. Figure captions are included in the BioC JSON in the sequential order in which they occur within paragraphs.</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Table data and figure captions occur at the end of the JSON document. Table content is given as XML.</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Abbreviations section</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Abbreviations section stored in abbreviations JSON. Abbreviation and definition components are related. Incomplete/one-sided definitions are not stored.</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Abbreviations and definitions from the abbreviations section are stored separately as text with no relations between the two components. Incomplete/one-sided definitions are stored.</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Link anchor text</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Link anchor text retained (HTML element tags removed).</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Link anchor text removed.</td>
+</tr>
+<tr>
+<td valign="top" align="left" rowspan="1" colspan="1">Character encoding</td>
+<td valign="top" align="left" rowspan="1" colspan="1">UTF-8 used for outputs</td>
+<td valign="top" align="left" rowspan="1" colspan="1">Available in Unicode and ASCII</td>
+</tr>
+</tbody>
+</table></div>
+<div class="p text-right font-secondary"><a href="table/T4/" class="usa-link" target="_blank" rel="noopener noreferrer">Open in a new tab</a></div></section><p>The Auto-CORPus BioC output includes the figure captions where they appear in the text and a separate table JSON file to store the table data, whereas the PMC BioC adds these data at the end of the JSON document and provides table content as a block of XML. Abbreviation sections are not included in the Auto-CORPus BioC output since Auto-CORPus provides a dedicated abbreviations JSON output. In the PMC BioC format the abbreviations and definitions are not related, whereas in the Auto-CORPus abbreviations JSON output the two elements are related. If an abbreviation does not contain a definition in the abbreviations section (perhaps due to an editorial error), PMC BioC will include the undefined thus meaningless abbreviation string, whereas Auto-CORPus will ignore it. Link anchor text to figures, tables, references and URLs are retained in the Auto-CORPus output but removed in the PMC BioC output. The most common differences between the two BioC versions is the encodings/strings used to reflect different whitespace characters and other special characters, with the remaining content being identical.</p>
+<p>The proportion of characters from 9,468 full-text paragraphs in the publisher dataset that also appear in the Auto-CORPus PMC BioC dataset in the same order in the paragraph string were evaluated. The median and interquartile range of the (left-skewed) similarity is also 100 and 100–100%, respectively, and differences between the PMC and publisher-versions are the same as those previously observed and reported in <a href="#T4" class="usa-link">Table 4</a>.</p>
+<p>Last, we evaluated the section title mapping to IAO terms for publications from non-biomedical domains (physics, psychology). We observed that not all publications from these domains have standardized headers that can be mapped directly or with fuzzy matching and require the digraph to map headers. Most headers are mapped correctly either to one or multiple (potential) IAO terms (<a href="#SM1" class="usa-link">Supplementary Table 1</a>). Only one publication contained a mismatch where two sections were mapped to introduction and methods sections, respectively, where each of these contained sub-headers that relate to introduction, methods and results. In two physics publications we encountered the case where the “proportional to” sign (∝) could not be mapped by the encoder.</p>
+<section id="sec14"><h4 class="pmc_sec_title">Performance of Auto-CORPus Table Processing</h4>
+<p>We assessed the accuracy of the table JSON output generated from non-PMC linked tables compared with table JSON output generated from the equivalent PMC HTML with inline tables. The comparative analysis method described above was used for comparing BioC output from the linked table and inline table datasets, except here it was applied to both strings (bidirectional, taking the maximum value of both outcomes). This is equivalent to the Levenshtein similarity applied to transform the larger string into the smaller string, with the exception that the different characters for both comparisons are retained for identifying the differences. The correspondence between table JSON files in the linked table and inline table datasets was calculated as the number of characters correctly represented in the publishers table JSON output relative to the PMC versions [also using the (symmetric) longest common subsequence method]. Both the text and table similarity are represented as the median (inter-quartile range) to account for non-normal distributions of the data. Any differences identified during these analyses were at the paragraph or table row level, enabling manual investigation of these sections in a side-by-side comparison of the files.</p>
+<p>The proportion of characters from 367 tables in the linked table dataset that also appear in the inline table dataset in the same order in the cell or text string were evaluated. The median and interquartile range of the (left-skewed) similarity is 100 and 99.79–100.00%, respectively. We found that there were structural differences between some of the output files where additional data rows were present in the JSON files generated from the publisher's files. This occurred because cell value strings in tables from the publisher's files were split across two rows, however in the PMC version the string was formatted (wrapped) to be contained within a single row. The use of different table structures to contain the same data resulted in accurate but differing table JSON outputs. Most of the differences between table content and metadata values pertain to the character encoding used in the different table versions. For example, we have found different uses of hyphen/em dash/en dash/minus symbols between different versions, and Greek letters were represented differently in the different table versions. Other differences are related to how numbers are represented in scientific notation. If a cell contains a number only, then it is represented as a JSON number data type in the output. However, if the cell contains non-numeric characters, then there is no standardization of the cell text and the notation used (e.g., the × symbol or E notation) will be reproduced in the JSON output. When there is variation in notation between sources, the JSON outputs will differ. Other editorial differences include whether thousands are represented with or without commas and how whitespace characters are used. Despite these variations there was no information loss between processed inline and linked tables.</p></section></section><section id="sec15"><h3 class="pmc_sec_title">Application: NER on GWAS Publications</h3>
+<p>Our intention is that Auto-CORPus supports information extraction from the biomedical literature. To demonstrate the use of Auto-CORPus outputs within a real-world application and aligned to the authors' expertise to support the evaluation of the results, we applied named-entity recognition (NER) to the Auto-CORPus BioC full-text output to extract GWAS metadata. Study metadata are included in curated GWAS databases, such as GWAS Central, and the ability to extract these entities automatically could provide a valuable curation aid. Full details of the method and the rationale behind the application is provided in the <a href="#SM1" class="usa-link">Supplementary Methods</a>. In summary, we filtered out sentences in the methods sections from the BioC full-text output that contain information on the genotyping platforms, assays, total number of genetic variants, quality control and imputation that were used. We trained five separate algorithms for NER (one for each metadata type) using 700 GWAS publications and evaluated these on 500 GWAS publications of the test set. The F1-scores for the five tasks are between 0.82 and 1.00 (<a href="#SM1" class="usa-link">Supplementary Table 2</a>) with examples given in <a href="#SM1" class="usa-link">Supplementary Figure 4</a>.</p></section></section><section id="s4"><h2 class="pmc_sec_title">Discussion</h2>
+<section id="sec17"><h3 class="pmc_sec_title">Strengths and Limitations</h3>
+<p>We have shown that Auto-CORPus brings together and bolsters several disjointed standards (BioC and IAO) and algorithmic components (for processing tables and abbreviations) of scientific literature analytics into a convenient and reliable tool for standardizing full-text and tables. The BioC format is a useful but not ubiquitous standard for representing text and annotations. Auto-CORPus enables the transformation of the widely available HTML format into BioC JSON following the setup of a configuration file associated with the structure of the HTML documents. The use of the configuration file drives the flexibility of the package, but also restricts use to users who are confident exploring HTML document structures. We make available the configuration files used in the evaluations described in this paper. To process additional sources, an upfront time investment is required from the user to explore the HTML structure and set the configuration file. We will be increasing the number of configuration files available for larger publishers, and we help non-technical users by providing documentation to explain how to setup configuration files. We welcome configuration files submitted by users and the documentation describes the process for users to submit files. Configuration files contain a section for tracking contributions made to the file, so the names of authors and editors can be logged. Once a configuration file has been submitted and tested, the file will be included within the Auto-CORPus package and the user credited (should they wish) with authorship of the file.</p>
+<p>The inclusion of IAO terms within the Auto-CORPus BioC output standardizes the description of publication sections across all processed sources. The digraph that is used to assign unmapped paragraph headers to standard IAO terms was constructed using both GWAS and MWAS literature to avoid training it to be used for a single domain only. We have tested the algorithms on PMC articles from three different physics and three psychology journals to confirm the BioC JSON output and IAO term recognition extend beyond only biomedical literature. Virtually all header terms from these articles were mapped to relevant IAO terms even when not all headers could be mapped, however some sections were mapped to multiple IAO terms based on paths in the digraph. Since ontologies are stable but not static, any resource or service that relies on one ontology structure could become outdated or redundant as the ontology is updated. We will rerun the fuzzy matching of headers to IAO terms and regenerate the digraph as new terms are introduced to the <em>document part</em> branch of IAO. We have experience of this when our first group of term suggestions based on the digraph were included into the IAO.</p>
+<p>The BioC output of abbreviations contains the abbreviation, definition and the algorithm(s) by which each pair was identified. One limitation of the current full-text abbreviation algorithm is that it searches for abbreviations in brackets and therefore will not find abbreviations for which the definition is in brackets, or abbreviations that are defined without use of brackets. The current structure of the abbreviation JSON allows additional methods to be included alongside the two methods currently used. Adding further algorithms to find different types of abbreviation in the full-text is considered as part of future work.</p>
+<p>Auto-CORPus implements a method for extracting table structures and data that was developed to extract table information from XML formatted tables (<a href="#B8" class="usa-link" aria-describedby="B8">8</a>). The use of the configuration file for identifying table containers enables the table processing to be focused on relevant data tables and exclude other tables associated with web page formatting. Auto-CORPus is distinct from other work in this field that uses machine learning methods to classify the types of information within tables (<a href="#B16" class="usa-link" aria-describedby="B16">16</a>). Auto-CORPus table processing is agnostic to the extracted variables, with the only distinction made between numbers and strings for the pragmatic reason of correctly formatting the JSON data type. The table JSON files could be used in downstream analysis (and annotation) of cell information types, but the intention of Auto-CORPus is to provide the capability to generate a faithful standardized output from any HTML source file. We have shown high accuracy (&gt;99%) for the tables we have processed with a configuration file and the machine learning method was shown to recover data from ~86% of tables (<a href="#B16" class="usa-link" aria-describedby="B16">16</a>). Accurate extraction is possible across more data sources with the Auto-CORPus rule-based approach, but a greater investment in setup time is required.</p>
+<p>Auto-CORPus focuses on HTML versions of articles as these are readily and widely available within the biomedical domain. Currently the processing of PDF documents is not supported, but the work by the Semantic Scholar group to convert PDF documents to HTML is encouraging as they observed that 87% of PDF documents processed showed little to no readability issues (<a href="#B4" class="usa-link" aria-describedby="B4">4</a>). The ability to leverage reliable document transformation will have implications for processing supplementary information files and broader scientific literature sources which are sometimes only available in PDF format, and therefore will require conversion to the accessible and reusable HTML format.</p></section><section id="sec18"><h3 class="pmc_sec_title">Future Research and Conclusions</h3>
+<p>We found that the tables for some publications are made available as images (see <a href="#T1" class="usa-link">Table 1</a>), so could not be processed by Auto-CORPus. To overcome this gap in publication table standardization, we are refining a plugin for Auto-CORPus that provides an algorithm for processing images of tables. The algorithm leverages Google's Tesseract optical character recognition engine to extract text from preprocessed table images. An overview of the table image processing pipeline is available in <a href="#SM1" class="usa-link">Supplementary Figure 3</a>. During our preliminary evaluation of the plugin, it achieved an accuracy of ~88% when processing a collection of 200 JPG and PNG table images taken from 23 different journals. Although encouraging, there are caveats in that the image formats must be of high resolution, the algorithm performs better on tables with gridlines than tables without gridlines, special characters are rarely interpreted correctly, and cell text formatting is lost. We are fine tuning the Tesseract model by training new datasets on biomedical data. An alpha release of the table image processing plugin is available with the Auto-CORPus package.</p>
+<p>The authors are involved in omics health data NLP projects that use Auto-CORPus within text mining pipelines to standardize and optimize biomedical literature ahead of entity and relation annotations and have given examples in the <a href="#SM1" class="usa-link">Supplementary Material</a> of how the Auto-CORPus output was used to train these algorithms. The BioC format supports the stand-off annotation of linguistic features such as tokens, part-of-speech tags and noun phrases, as well as the annotation of relations between these elements (<a href="#B5" class="usa-link" aria-describedby="B5">5</a>). We are developing machine learning methods to automatically extract genome-wide association study (GWAS) data from peer-reviewed literature. High quality annotated datasets are required to develop and train NLP algorithms and validate the outputs. We are developing a GWAS corpus that can be used for this purpose using a semi-automated annotation method. The GWAS Central database is a comprehensive collection of summary-level GWAS findings imported from published research papers or submitted by study authors (<a href="#B13" class="usa-link" aria-describedby="B13">13</a>). For GWAS Central studies, we used Auto-CORPus to standardize the full-text publication text and tables. In an automatic annotation step, for each publication, all GWAS Central association data was retrieved. Association data consists of three related entities: a phenotype/disease description, genetic marker, and an association <em>P</em>-value. A named entity recognition algorithm identifies the database entities in the Auto-CORPus BioC and table JSON files. The database entities and relations are mapped back onto the text, by expressing the annotations in BioC format and appending these to the relevant BioC element in the JSON files. 
The automatic annotations are then manually evaluated using the TeamTat text annotation tool which provides a user-friendly interface for annotating entities and relations (<a href="#B17" class="usa-link" aria-describedby="B17">17</a>). We use TeamTat to manually inspect the automatic annotations and modify or remove incorrect annotations, in addition to including new annotations that were not automatically generated. TeamTat accepts BioC input files and outputs in BioC format, thus the Auto-CORPus files that have been automatically annotated are suitable for importing into TeamTat. Work to create the GWAS corpus is ongoing, but the convenient semi-automatic process for creating high-quality annotations from biomedical literature HTML files described here could be adapted for creating other gold-standard corpora.</p>
+<p>In related work, we are developing a corpus for MWAS for metabolite named-entity recognition to enable the development of new NLP tools to speed up literature review. As part of this, the active development focuses on extending Auto-CORPus to analyse preprint literature and <a href="#SM1" class="usa-link">Supplementary Materials</a>, improving the abbreviation detection, and development of more configuration files. Our preliminary work on preprint literature has shown we can map paragraphs in Rxiv versions to paragraphs in the peer-reviewed manuscript with high accuracy (average similarity of paragraphs &gt;95%). Another planned extension is to classify paragraphs based on the text in the case where headers are mapped to multiple IAO terms. The flexibility of the Auto-CORPus configuration file enables researchers to use Auto-CORPus to process publications and data from a broad variety of sources to create reusable corpora for many use cases in biomedical literature and other scientific fields.</p></section></section><section id="s5"><h2 class="pmc_sec_title">Data Availability Statement</h2>
+<p>Publicly available datasets were analyzed in this study. The Auto-CORPus package is freely available from GitHub (<a href="https://github.com/omicsNLP/Auto-CORPus" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus</a>) and can be deployed on local machines as well as using high-performance computing to process publications in batch. A step-by-step guide to detail how to use Auto-CORPus is supplied with the package. Data from both Open Access (<em>via</em> PubMed Central) and publisher repositories are used, the latter were downloaded within university library licenses and cannot be shared.</p></section><section id="s6"><h2 class="pmc_sec_title">Author Contributions</h2>
+<p>TB and JP designed and supervised the research and wrote the manuscript. TB contributed the GWAS use case and JP contributed the MWAS/metabolomics use cases. TS developed the BioC outputs and led the coding integration aspects. YH developed the section header standardization algorithm and implemented the abbreviation recognition algorithm. ZL developed the table image recognition and processing algorithm. SS developed the table extraction algorithm and main configuration file. CP developed configuration files for preprint texts. NM developed the NER algorithms for GWAS entity recognition. NM, FM, CY, ZL, and CP tested the package and performed comparative analysis of outputs. TR refined standardization of full-texts and contributed algorithms for character set conversions. All authors read, edited, and approved the manuscript.</p></section><section id="s7"><h2 class="pmc_sec_title">Funding</h2>
+<p>This work has been supported by Health Data Research (HDR) UK and the Medical Research Council <em>via</em> a UKRI Innovation Fellowship to TB (MR/S003703/1) and a Rutherford Fund Fellowship to JP (MR/S004033/1).</p></section><section id="conf1"><h2 class="pmc_sec_title">Conflict of Interest</h2>
+<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></section><section id="s8"><h2 class="pmc_sec_title">Publisher's Note</h2>
+<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></section><section id="ack1" class="ack"><h2 class="pmc_sec_title">Acknowledgments</h2>
+<p>We thank Mohamed Ibrahim (University of Leicester) for identifying different configurations of tables for different HTML formats.</p></section><section id="s9"><h2 class="pmc_sec_title">Supplementary Material</h2>
+<p>The Supplementary Material for this article can be found online at: <a href="https://www.frontiersin.org/articles/10.3389/fdgth.2022.788124/full#supplementary-material" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://www.frontiersin.org/articles/10.3389/fdgth.2022.788124/full#supplementary-material</a></p>
+<section class="sm xbox font-sm" id="SM1"><div class="media p"><div class="caption">
+<a href="/articles/instance/8885717/bin/Data_Sheet_1.PDF" data-ga-action="click_feat_suppl" class="usa-link">Click here for additional data file.</a><sup> (1.8MB, PDF) </sup>
+</div></div></section></section><section id="ref-list1" class="ref-list"><h2 class="pmc_sec_title">References</h2>
+<section id="ref-list1_sec2"><ul class="ref-list font-sm" style="list-style-type:none">
+<li id="B1">
+<span class="label">1.</span><cite>Sheikhalishahi S, Miotto R, Dudley JT, Lavelli A, Rinaldi F, Osmani V. Natural language processing of clinical notes on chronic diseases: systematic review. JMIR Med Inform. (2019) 7:e12239. 10.2196/12239</cite> [<a href="https://doi.org/10.2196/12239" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC6528438/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/31066697/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=JMIR%20Med%20Inform.&amp;title=Natural%20language%20processing%20of%20clinical%20notes%20on%20chronic%20diseases:%20systematic%20review&amp;author=S%20Sheikhalishahi&amp;author=R%20Miotto&amp;author=JT%20Dudley&amp;author=A%20Lavelli&amp;author=F%20Rinaldi&amp;volume=7&amp;publication_year=2019&amp;pages=e12239&amp;pmid=31066697&amp;doi=10.2196/12239&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B2">
+<span class="label">2.</span><cite>Jackson RG, Patel R, Jayatilleke N, Kolliakou A, Ball M, Gorrell G, et al. Natural language processing to extract symptoms of severe mental illness from clinical text: the Clinical Record Interactive Search Comprehensive Data Extraction (CRIS-CODE) project. BMJ Open. (2017) 7:e012012. 10.1136/bmjopen-2016-012012</cite> [<a href="https://doi.org/10.1136/bmjopen-2016-012012" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC5253558/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/28096249/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=BMJ%20Open&amp;title=Natural%20language%20processing%20to%20extract%20symptoms%20of%20severe%20mental%20illness%20from%20clinical%20text:%20the%20Clinical%20Record%20Interactive%20Search%20Comprehensive%20Data%20Extraction%20(CRIS-CODE)%20project&amp;author=RG%20Jackson&amp;author=R%20Patel&amp;author=N%20Jayatilleke&amp;author=A%20Kolliakou&amp;author=M%20Ball&amp;volume=7&amp;publication_year=2017&amp;pages=e012012&amp;pmid=28096249&amp;doi=10.1136/bmjopen-2016-012012&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B3">
+<span class="label">3.</span><cite>Erhardt RA, Schneider R, Blaschke C. Status of text-mining techniques applied to biomedical text. Drug Discov Today. (2006) 11:315–25. 10.1016/j.drudis.2006.02.011</cite> [<a href="https://doi.org/10.1016/j.drudis.2006.02.011" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/16580973/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Drug%20Discov%20Today.&amp;title=Status%20of%20text-mining%20techniques%20applied%20to%20biomedical%20text&amp;author=RA%20Erhardt&amp;author=R%20Schneider&amp;author=C%20Blaschke&amp;volume=11&amp;publication_year=2006&amp;pages=315-25&amp;pmid=16580973&amp;doi=10.1016/j.drudis.2006.02.011&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B4">
+<span class="label">4.</span><cite>Wang LL, Cachola I, Bragg J, Yu-Yen Cheng E, Haupt C, Latzke M, et al. Improving the accessibility of scientific documents: current state, user needs, and a system solution to enhance scientific PDF accessibility for blind and low vision users. arXiv e-prints: arXiv:2105.00076 (2021). Available online at: <a href="https://arxiv.org/pdf/2105.00076.pdf" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://arxiv.org/pdf/2105.00076.pdf</a></cite>
+</li>
+<li id="B5">
+<span class="label">5.</span><cite>Comeau DC, Islamaj Dogan R, Ciccarese P, Cohen KB, Krallinger M, Leitner F, et al. BioC: a minimalist approach to interoperability for biomedical text processing. Database. (2013) 2013:bat064. 10.1093/database/bat064</cite> [<a href="https://doi.org/10.1093/database/bat064" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC3889917/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/24048470/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Database.&amp;title=BioC:%20a%20minimalist%20approach%20to%20interoperability%20for%20biomedical%20text%20processing&amp;author=DC%20Comeau&amp;author=R%20Islamaj%20Dogan&amp;author=P%20Ciccarese&amp;author=KB%20Cohen&amp;author=M%20Krallinger&amp;volume=2013&amp;publication_year=2013&amp;pages=bat064&amp;pmid=24048470&amp;doi=10.1093/database/bat064&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B6">
+<span class="label">6.</span><cite>Comeau DC, Wei CH, Islamaj Dogan R, Lu Z. PMC text mining subset in BioC: about three million full-text articles and growing. Bioinformatics. (2019) 35:3533–5. 10.1093/bioinformatics/btz070</cite> [<a href="https://doi.org/10.1093/bioinformatics/btz070" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC6748740/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/30715220/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Bioinformatics.&amp;title=PMC%20text%20mining%20subset%20in%20BioC:%20about%20three%20million%20full-text%20articles%20and%20growing&amp;author=DC%20Comeau&amp;author=CH%20Wei&amp;author=R%20Islamaj%20Dogan&amp;author=Z%20Lu&amp;volume=35&amp;publication_year=2019&amp;pages=3533-5&amp;pmid=30715220&amp;doi=10.1093/bioinformatics/btz070&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B7">
+<span class="label">7.</span><cite>Ceusters W. An information artifact ontology perspective on data collections and associated representational artifacts. Stud Health Technol Inform. (2012) 180:68–72. 10.3233/978-1-61499-101-4-68</cite> [<a href="https://doi.org/10.3233/978-1-61499-101-4-68" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/22874154/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Stud%20Health%20Technol%20Inform.&amp;title=An%20information%20artifact%20ontology%20perspective%20on%20data%20collections%20and%20associated%20representational%20artifacts&amp;author=W%20Ceusters&amp;volume=180&amp;publication_year=2012&amp;pages=68-72&amp;pmid=22874154&amp;doi=10.3233/978-1-61499-101-4-68&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B8">
+<span class="label">8.</span><cite>Milosevic N, Gregson C, Hernandez R, Nenadic G. Disentangling the structure of tables in scientific literature. In: Métais E, Meziane F, Saraee M, Sugumaran V, Vadera S, editors. Natural Language Processing and Information Systems. Cham: Springer International Publishing;  (2016). p. 162–74. 10.1007/978-3-319-41754-7_14</cite> [<a href="https://doi.org/10.1007/978-3-319-41754-7_14" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="https://scholar.google.com/scholar_lookup?title=Natural%20Language%20Processing%20and%20Information%20Systems&amp;author=N%20Milosevic&amp;author=C%20Gregson&amp;author=R%20Hernandez&amp;author=G%20Nenadic&amp;publication_year=2016&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B9">
+<span class="label">9.</span><cite>Craven M, Kumlien J. Constructing biological knowledge bases by extracting information from text sources. In: International Conference on Intelligent Systems for Molecular Biology.
+Heidelberg:  (1999) p. 77–86. </cite> [<a href="https://pubmed.ncbi.nlm.nih.gov/10786289/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?title=International%20Conference%20on%20Intelligent%20Systems%20for%20Molecular%20Biology.&amp;author=M%20Craven&amp;author=J%20Kumlien&amp;publication_year=1999&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B10">
+<span class="label">10.</span><cite>Blaschke C, Andrade MA, Ouzounis C, Valencia A. Automatic extraction of biological information from scientific text: protein-protein interactions. In: International Conference on Intelligent Systems for Molecular Biology. Heidelberg:  (1999) p. 60–7. </cite> [<a href="https://pubmed.ncbi.nlm.nih.gov/10786287/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?title=International%20Conference%20on%20Intelligent%20Systems%20for%20Molecular%20Biology&amp;author=C%20Blaschke&amp;author=MA%20Andrade&amp;author=C%20Ouzounis&amp;author=A%20Valencia&amp;publication_year=1999&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B11">
+<span class="label">11.</span><cite>Andrade MA, Valencia A. Automatic annotation for biological sequences by extraction of keywords from MEDLINE abstracts. Development of a prototype system. Proc Int Conf Intell Syst Mol Biol. (1997) 5:25–32. </cite> [<a href="https://pubmed.ncbi.nlm.nih.gov/9322011/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Proc%20Int%20Conf%20Intell%20Syst%20Mol%20Biol.&amp;title=Automatic%20annotation%20for%20biological%20sequences%20by%20extraction%20of%20keywords%20from%20MEDLINE%20abstracts.%20Development%20of%20a%20prototype%20system&amp;author=MA%20Andrade&amp;author=A%20Valencia&amp;volume=5&amp;publication_year=1997&amp;pages=25-32&amp;pmid=9322011&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B12">
+<span class="label">12.</span><cite>Schwartz AS, Hearst MA. A simple algorithm for identifying abbreviation definitions in biomedical text. Pac Symp Biocomput. (2003) 8:451–62. Available online at: <a href="https://psb.stanford.edu/psb-online/proceedings/psb03/schwartz.pdf" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://psb.stanford.edu/psb-online/proceedings/psb03/schwartz.pdf</a></cite> [<a href="https://pubmed.ncbi.nlm.nih.gov/12603049/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Pac%20Symp%20Biocomput.&amp;title=A%20simple%20algorithm%20for%20identifying%20abbreviation%20definitions%20in%20biomedical%20text&amp;author=AS%20Schwartz&amp;author=MA%20Hearst&amp;volume=8&amp;publication_year=2003&amp;pages=451-62&amp;pmid=12603049&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B13">
+<span class="label">13.</span><cite>Beck T, Shorter T, Brookes AJ. GWAS Central: a comprehensive resource for the discovery and comparison of genotype and phenotype data from genome-wide association studies. Nucleic Acids Res. (2020) 48:D933–40. 10.1093/nar/gkz895</cite> [<a href="https://doi.org/10.1093/nar/gkz895" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC7145571/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/31612961/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Nucleic%20Acids%20Res&amp;title=GWAS%20Central:%20a%20comprehensive%20resource%20for%20the%20discovery%20and%20comparison%20of%20genotype%20and%20phenotype%20data%20from%20genome-wide%20association%20studies&amp;author=T%20Beck&amp;author=T%20Shorter&amp;author=AJ%20Brookes&amp;volume=48&amp;publication_year=2020&amp;pages=D933-40&amp;pmid=31612961&amp;doi=10.1093/nar/gkz895&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B14">
+<span class="label">14.</span><cite>Ghazvinian A, Noy NF, Musen MA. Creating mappings for ontologies in biomedicine: simple methods work. AMIA Annu Symp Proc. (2009) 2009:198–202. </cite> [<a href="/articles/PMC2815474/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/20351849/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=AMIA%20Annu%20Symp%20Proc.&amp;title=Creating%20mappings%20for%20ontologies%20in%20biomedicine:%20simple%20methods%20work&amp;author=A%20Ghazvinian&amp;author=NF%20Noy&amp;author=MA%20Musen&amp;volume=2009&amp;publication_year=2009&amp;pages=198-202&amp;pmid=20351849&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B15">
+<span class="label">15.</span><cite>Keller MF, Reiner AP, Okada Y, van Rooij FJ, Johnson AD, Chen MH, et al. Trans-ethnic meta-analysis of white blood cell phenotypes. Hum Mol Genet. (2014) 23:6944–60. 10.1093/hmg/ddu401</cite> [<a href="https://doi.org/10.1093/hmg/ddu401" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC4245044/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/25096241/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Hum%20Mol%20Genet.&amp;title=Trans-ethnic%20meta-analysis%20of%20white%20blood%20cell%20phenotypes&amp;author=MF%20Keller&amp;author=AP%20Reiner&amp;author=Y%20Okada&amp;author=FJ%20van%20Rooij&amp;author=AD%20Johnson&amp;volume=23&amp;publication_year=2014&amp;pages=6944-60&amp;pmid=25096241&amp;doi=10.1093/hmg/ddu401&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B16">
+<span class="label">16.</span><cite>Milosevic N, Gregson C, Hernandez R, Nenadic G. A framework for information extraction from tables in biomedical literature. Int J Docum Anal Recogn. (2019) 22:55–78. 10.1007/s10032-019-00317-023869631</cite> [<a href="https://doi.org/10.1007/s10032-019-00317-0" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Int%20J%20Docum%20Anal%20Recogn.&amp;title=A%20framework%20for%20information%20extraction%20from%20tables%20in%20biomedical%20literature&amp;author=N%20Milosevic&amp;author=C%20Gregson&amp;author=R%20Hernandez&amp;author=G%20Nenadic&amp;volume=22&amp;publication_year=2019&amp;pages=55-78&amp;doi=10.1007/s10032-019-00317-0&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+<li id="B17">
+<span class="label">17.</span><cite>Islamaj R, Kwon D, Kim S, Lu Z. TeamTat: a collaborative text annotation tool. Nucleic Acids Res. (2020) 48:W5–11. 10.1093/nar/gkaa333</cite> [<a href="https://doi.org/10.1093/nar/gkaa333" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">DOI</a>] [<a href="/articles/PMC7319445/" class="usa-link">PMC free article</a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/32383756/" class="usa-link">PubMed</a>] [<a href="https://scholar.google.com/scholar_lookup?journal=Nucleic%20Acids%20Res&amp;title=TeamTat:%20a%20collaborative%20text%20annotation%20tool&amp;author=R%20Islamaj&amp;author=D%20Kwon&amp;author=S%20Kim&amp;author=Z%20Lu&amp;volume=48&amp;publication_year=2020&amp;pages=W5-11&amp;pmid=32383756&amp;doi=10.1093/nar/gkaa333&amp;" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">Google Scholar</a>]</li>
+</ul></section></section><section id="_ad93_" lang="en" class="associated-data"><h2 class="pmc_sec_title">Associated Data</h2>
+<p class="font-secondary"><em>This section collects any data citations, data availability statements, or supplementary materials included in this article.</em></p>
+<section id="_adsm93_" lang="en" class="supplementary-materials"><h3 class="pmc_sec_title">Supplementary Materials</h3>
+<section class="sm xbox font-sm" id="db_ds_supplementary-material1_reqid_"><div class="media p"><div class="caption">
+<a href="/articles/instance/8885717/bin/Data_Sheet_1.PDF" data-ga-action="click_feat_suppl" class="usa-link">Click here for additional data file.</a><sup> (1.8MB, PDF) </sup>
+</div></div></section></section><section id="_adda93_" lang="en" class="data-availability-statement"><h3 class="pmc_sec_title">Data Availability Statement</h3>
+<p>Publicly available datasets were analyzed in this study. The Auto-CORPus package is freely available from GitHub (<a href="https://github.com/omicsNLP/Auto-CORPus" class="usa-link usa-link--external" data-ga-action="click_feat_suppl" target="_blank" rel="noopener noreferrer">https://github.com/omicsNLP/Auto-CORPus</a>) and can be deployed on local machines as well as using high-performance computing to process publications in batch. A step-by-step guide to detail how to use Auto-CORPus is supplied with the package. Data from both Open Access (<em>via</em> PubMed Central) and publisher repositories are used, the latter were downloaded within university library licenses and cannot be shared.</p></section></section></section><footer class="p courtesy-note font-secondary font-sm text-center"><hr class="headless">
+<p>Articles from Frontiers in Digital Health are provided here courtesy of <strong>Frontiers Media SA</strong></p></footer></section></article>
+
+                      
+
+                    </main>
+                </div>
+            </div>
+        </div>
+
+        
+
+
+
+<!-- Secondary navigation placeholder -->
+<div class="pmc-sidenav desktop:grid-col-4 display-flex">
+    <section class="pmc-sidenav__container" aria-label="Article resources and navigation">
+        <button type="button" class="usa-button pmc-sidenav__container__close usa-button--unstyled">
+            <img src="/static/img/usa-icons/close.svg" role="img" alt="Close" />
+        </button>
+    <div class="display-none desktop:display-block">
+       <section class="margin-top-4 desktop:margin-top-0">
+              <h2 class="margin-top-0">ACTIONS</h2>
+           <ul class="usa-list usa-list--unstyled usa-list--actions">
+               
+               <li>
+                     <a
+                             href="https://doi.org/10.3389/fdgth.2022.788124"
+                             class="usa-button usa-button--outline width-24 font-xs usa-link--external padding-left-0 padding-right-0"
+                             target="_blank"
+                             rel="noreferrer noopener"
+                             data-ga-category="actions"
+                             data-ga-action="click"
+                             data-ga-label="publisher_link_desktop"
+                     >
+                         <span class="height-3 display-inline-flex flex-align-center">View on publisher site</span>
+                     </a>
+               </li>
+               
+               
+               <li>
+                    <a
+                            href="pdf/fdgth-04-788124.pdf"
+                            class="usa-button usa-button--outline width-24 display-inline-flex flex-align-center flex-justify-start padding-left-1"
+                            data-ga-category="actions"
+                            data-ga-action="click"
+                            data-ga-label="pdf_download_desktop"
+                    >
+                         <svg class="usa-icon width-3 height-3" aria-hidden="true" focusable="false" role="img" hidden>
+                            <use xlink:href="/static/img/sprite.svg#file_download"></use>
+                        </svg>
+                        <span class="display-inline-flex flex-justify-center flex-1">PDF (2.8 MB)</span>
+                    </a>
+               </li>
+               
+                
+               <li>
+                   <button role="button" class="usa-button width-24 citation-dialog-trigger display-inline-flex flex-align-center flex-justify-start padding-left-1"
+                        aria-label="Open dialog with citation text in different styles"
+                        data-ga-category="actions"
+                        data-ga-action="open"
+                        data-ga-label="cite_desktop"
+                        data-all-citations-url="/resources/citations/8885717/"
+                        data-citation-style="nlm"
+                        data-download-format-link="/resources/citations/8885717/export/"
+                    >
+                        <svg class="usa-icon width-3 height-3" aria-hidden="true" focusable="false" role="img" hidden>
+                            <use xlink:href="/static/img/sprite.svg#format_quote"></use>
+                        </svg>
+                       <span class="display-inline-flex flex-justify-center flex-1 button-label">Cite</span>
+                    </button>
+               </li>
+                
+               <li>
+
+                        <button class="usa-button width-24 collections-dialog-trigger collections-button display-inline-flex flex-align-center flex-justify-start padding-left-1 collections-button-empty"
+                              aria-label="Save article in MyNCBI collections."
+                              data-ga-category="actions"
+                              data-ga-action="click"
+                              data-ga-label="collections_button_desktop"
+                              data-collections-open-dialog-enabled="false"
+                              data-collections-open-dialog-url="https://account.ncbi.nlm.nih.gov/?back_url=https%3A%2F%2Fpmc.ncbi.nlm.nih.gov%2Farticles%2FPMC8885717%2F%23open-collections-dialog"
+                              data-in-collections="false">
+                            <svg class="usa-icon width-3 height-3 usa-icon--bookmark-full" aria-hidden="true" focusable="false" role="img" hidden>
+                                <use xlink:href="/static/img/action-bookmark-full.svg#icon"></use>
+                            </svg>
+                            <svg class="usa-icon width-3 height-3 usa-icon--bookmark-empty" aria-hidden="true" focusable="false" role="img" hidden>
+                                <use xlink:href="/static/img/action-bookmark-empty.svg#icon"></use>
+                            </svg>
+                            <span class="display-inline-flex flex-justify-center flex-1">Collections</span>
+                       </button>
+               </li>
+               <li class="pmc-permalink">
+                    <button
+                            type="button"
+                            class="usa-button usa-button--outline width-24 display-inline-flex flex-align-center flex-justify padding-left-1 shadow-none"
+                            aria-label="Show article permalink"
+                            aria-expanded="false"
+                            aria-haspopup="true"
+                            data-ga-category="actions"
+                            data-ga-action="open"
+                            data-ga-label="permalink_desktop"
+                    >
+                         <svg class="usa-icon width-3 height-3" aria-hidden="true" focusable="false" role="img" hidden>
+                            <use xlink:href="/static/img/sprite.svg#share"></use>
+                        </svg>
+                        <span class="display-inline-flex flex-justify-center flex-1 button-label">Permalink</span>
+                    </button>
+                   
+
+<div class="pmc-permalink__dropdown" hidden>
+    <div class="pmc-permalink__dropdown__container">
+          <h2 class="usa-modal__heading margin-top-0 margin-bottom-2">PERMALINK</h2>
+          <div class="pmc-permalink__dropdown__content">
+              <input type="text" class="usa-input" value="https://pmc.ncbi.nlm.nih.gov/articles/PMC8885717/" aria-label="Article permalink">
+              <button class="usa-button display-inline-flex pmc-permalink__dropdown__copy__btn margin-right-0" title="Copy article permalink" data-ga-category="save_share" data-ga-action="link" data-ga-label="copy_link">
+                  <svg class="usa-icon" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#content_copy"></use>
+                  </svg>
+                  <span class="margin-left-1">Copy</span>
+              </button>
+          </div>
+    </div>
+</div>
+               </li>
+           </ul>
+       </section>
+     </div>
+
+        <section class="pmc-resources margin-top-6 desktop:margin-top-4" data-page-path="/articles/PMC8885717/">
+            <h2 class="margin-top-0">RESOURCES</h2>
+            
+                <div class="usa-accordion usa-accordion--multiselectable" data-allow-multiple>
+                    <h3 class="usa-accordion__heading">
+                        <button
+                        type="button"
+                        class="usa-accordion__button"
+                        aria-expanded="false"
+                        aria-controls="resources-similar-articles"
+                        data-ga-category="resources_accordion"
+                        data-ga-action="open_similar_articles"
+                        data-ga-label="/articles/PMC8885717/"
+                        data-action-open="open_similar_articles"
+                        data-action-close="close_similar_articles"
+                        >
+                            Similar articles
+                        </button>
+                    </h3>
+                    <div
+                            id="resources-similar-articles"
+                            class="usa-accordion__content usa-prose"
+                            
+                                data-source-url="/resources/similar-article-links/35243479/"
+                            
+                    >
+                        
+                    </div>
+                    <h3 class="usa-accordion__heading">
+                        <button
+                        type="button"
+                        class="usa-accordion__button"
+                        aria-expanded="false"
+                        aria-controls="resources-cited-by-other-articles"
+                        data-ga-category="resources_accordion"
+                        data-ga-action="open_cited_by"
+                        data-ga-label="/articles/PMC8885717/"
+                        data-action-open="open_cited_by"
+                        data-action-close="close_cited_by"
+                        >
+                             Cited by other articles
+                        </button>
+                    </h3>
+                    <div
+                            id="resources-cited-by-other-articles"
+                            class="usa-accordion__content usa-prose"
+                            
+                                data-source-url="/resources/cited-by-links/35243479/"
+                            
+                    >
+                          
+                    </div>
+                    
+                        <h3 class="usa-accordion__heading">
+                            <button
+                            type="button"
+                            class="usa-accordion__button"
+                            aria-expanded="false"
+                            aria-controls="resources-links-to-ncbi-databases"
+                            data-ga-category="resources_accordion"
+                            data-ga-action="open_NCBI_links"
+                            data-ga-label="/articles/PMC8885717/"
+                            data-action-open="open_NCBI_links"
+                            data-action-close="close_NCBI_link"
+                            >
+                                 Links to NCBI Databases
+                            </button>
+                        </h3>
+                        <div
+                                id="resources-links-to-ncbi-databases"
+                                class="usa-accordion__content usa-prose"
+                                data-source-url="/resources/db-links/8885717/"
+                        >
+                        </div>
+                    
+                    
+                </div>
+            
+        </section>
+
+
+        <section
+        class="usa-in-page-nav usa-in-page-nav--wide margin-top-6 desktop:margin-top-4"
+        data-title-text="On this page"
+        data-title-heading-level="h2"
+        data-scroll-offset="0"
+        data-root-margin="-10% 0px -80% 0px"
+        data-main-content-selector="main"
+        data-threshold="1"
+        hidden
+        ></section>
+    </section>
+</div>
+
+
+        
+
+<div class="overlay" role="dialog" aria-label="Citation Dialog" hidden>
+    <div class="dialog citation-dialog" aria-hidden="true">
+        <div class="display-inline-flex flex-align-center flex-justify width-full margin-bottom-2">
+            <h2 class="usa-modal__heading margin-0">Cite</h2>
+             <button type="button" class="usa-button usa-button--unstyled close-overlay text-black width-auto"  tabindex="1">
+                <svg class="usa-icon width-3 height-3" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#close"></use>
+                </svg>
+             </button>
+        </div>
+
+        
+
+<div class="citation-text-block">
+  <div class="citation-text margin-bottom-2"></div>
+  <ul class="usa-list usa-list--unstyled display-inline-flex flex-justify width-full flex-align-center">
+      <li>
+        <button
+          class="usa-button usa-button--unstyled text-no-underline display-flex flex-align-center copy-button dialog-focus"
+          data-ga-category="save_share"
+          data-ga-action="cite"
+          data-ga-label="copy"
+          tabindex="2">
+            <svg class="usa-icon width-3 height-3" aria-hidden="true" focusable="false" role="img">
+                <use xlink:href="/static/img/sprite.svg#content_copy"></use>
+            </svg>
+            <span>Copy</span>
+        </button>
+      </li>
+      <li>
+          <a
+              href="#"
+              role="button"
+              class="usa-button usa-button--unstyled text-no-underline display-flex flex-align-center export-button"
+              data-ga-category="save_share"
+              data-ga-action="cite"
+              data-ga-label="download"
+              title="Download a file for external citation management software"
+              tabindex="3">
+                <svg class="usa-icon width-3 height-3" aria-hidden="true" focusable="false" role="img">
+                    <use xlink:href="/static/img/sprite.svg#file_download"></use>
+                </svg>
+                <span class="display-none mobile-lg:display-inline">Download .nbib</span>
+                <span class="display-inline mobile-lg:display-none">.nbib</span>
+            </a>
+      </li>
+      <li>
+          
+
+<div class="display-inline-flex flex-align-center">
+  <label class="usa-label margin-top-0">Format:</label>
+  <select aria-label="Format" class="usa-select citation-style-selector padding-1 margin-top-0 border-0 padding-right-4" tabindex="4" >
+    
+      <option data-style-url-name="ama"
+              value="AMA"
+              >
+        AMA
+      </option>
+    
+      <option data-style-url-name="apa"
+              value="APA"
+              >
+        APA
+      </option>
+    
+      <option data-style-url-name="mla"
+              value="MLA"
+              >
+        MLA
+      </option>
+    
+      <option data-style-url-name="nlm"
+              value="NLM"
+              selected="selected">
+        NLM
+      </option>
+    
+  </select>
+</div>
+      </li>
+  </ul>
+</div>
+    </div>
+</div>
+
+        <div class="overlay" role="dialog" hidden>
+  <div id="collections-action-dialog" class="dialog collections-dialog" aria-hidden="true">
+   <div class="display-inline-flex flex-align-center flex-justify width-full margin-bottom-2">
+        <h2 class="usa-modal__heading margin-0">Add to Collections</h2>
+    </div>
+    <div class="collections-action-panel action-panel">
+      
+
+
+<form id="collections-action-dialog-form"
+      class="usa-form maxw-full collections-action-panel-form action-panel-content action-form action-panel-smaller-selectors"
+      data-existing-collections-url="/list-existing-collections/"
+      data-add-to-existing-collection-url="/add-to-existing-collection/"
+      data-create-and-add-to-new-collection-url="/create-and-add-to-new-collection/"
+      data-myncbi-max-collection-name-length="100"
+      data-collections-root-url="https://www.ncbi.nlm.nih.gov/myncbi/collections/">
+
+    <input type="hidden" name="csrfmiddlewaretoken" value="Sc5Uabcv1mD0LIMLbrgPyYxCBng0IUIedKpaaHimCv2eq1NRPeHHXS5UBkercYIT">
+
+    <fieldset class="usa-fieldset margin-bottom-2">
+        <div class="usa-radio">
+            <input type="radio"
+            id="collections-action-dialog-new"
+            class="usa-radio__input usa-radio__input--tile collections-new  margin-top-0"
+            name="collections"
+            value="new"
+            data-ga-category="collections_button"
+            data-ga-action="click"
+            data-ga-label="collections_radio_new" />
+            <label class="usa-radio__label" for="collections-action-dialog-new">Create a new collection</label>
+        </div>
+        <div class="usa-radio">
+            <input type="radio"
+            id="collections-action-dialog-existing"
+            class="usa-radio__input usa-radio__input--tile collections-existing"
+            name="collections"
+            value="existing"
+            checked="true"
+            data-ga-category="collections_button"
+            data-ga-action="click"
+            data-ga-label="collections_radio_existing" />
+            <label class="usa-radio__label" for="collections-action-dialog-existing">Add to an existing collection</label>
+        </div>
+    </fieldset>
+
+    <fieldset class="usa-fieldset margin-bottom-2">
+        <div class="action-panel-control-wrap new-collections-controls">
+           <label for="collections-action-dialog-add-to-new" class="usa-label margin-top-0">
+                Name your collection
+               <abbr title="required" class="usa-hint usa-hint--required text-no-underline">*</abbr>
+          </label>
+          <input
+            type="text"
+            name="add-to-new-collection"
+            id="collections-action-dialog-add-to-new"
+            class="usa-input collections-action-add-to-new"
+            pattern="[^&quot;&amp;=&lt;&gt;/]*" title="The following characters are not allowed in the Name field: &quot;&amp;=&lt;&gt;/"
+            maxlength=""
+            data-ga-category="collections_button"
+            data-ga-action="create_collection"
+            data-ga-label="non_favorties_collection"
+            required
+          />
+        </div>
+        <div class="action-panel-control-wrap existing-collections-controls">
+             <label for="collections-action-dialog-add-to-existing" class="usa-label margin-top-0">
+                Choose a collection
+              </label>
+              <select id="collections-action-dialog-add-to-existing"
+                      class="usa-select collections-action-add-to-existing"
+                      data-ga-category="collections_button"
+                      data-ga-action="select_collection"
+                      data-ga-label="($('.collections-action-add-to-existing').val() === 'Favorites') ? 'Favorites' : 'non_favorites_collection'">
+              </select>
+              <div class="collections-retry-load-on-error usa-input-error-message selection-validation-message">
+                Unable to load your collection due to an error<br>
+                <a href="#">Please try again</a>
+              </div>
+        </div>
+    </fieldset>
+
+    <div class="display-inline-flex">
+        <button class="usa-button margin-top-0 action-panel-submit"
+            type="submit"
+            data-loading-label="Adding..."
+            data-pinger-ignore
+            data-ga-category="collections_button"
+            data-ga-action="click"
+            data-ga-label="add">
+          Add
+        </button>
+        <button class="usa-button usa-button--outline margin-top-0 action-panel-cancel"
+                aria-label="Close 'Add to Collections' panel"
+                ref="linksrc=close_collections_panel"
+                data-ga-category="collections_button"
+                data-ga-action="click"
+                data-ga-label="cancel">
+          Cancel
+        </button>
+    </div>
+</form>
+    </div>
+  </div>
+</div>
+
+        
+
+      </div>
+    </div>
+  </div>
+
+
+
+        
+    
+
+<footer class="ncbi-footer ncbi-dark-background " >
+    
+        <div class="ncbi-footer__icon-section">
+            <div class="ncbi-footer__social-header">
+                Follow NCBI
+            </div>
+
+            <div class="grid-container ncbi-footer__ncbi-social-icons-container">
+                
+                    <a href="https://twitter.com/ncbi"
+                       class="ncbi-footer__social-icon ncbi-footer__social-icon--gray"
+                       target="_blank"
+                       rel="noreferrer noopener">
+                        <svg width="40"
+                             height="40"
+                             viewBox="0 0 40 40"
+                             fill="none"
+                             xmlns="http://www.w3.org/2000/svg"
+                             focusable="false"
+                             aria-hidden="true">
+                            <path d="m6.067 8 10.81 13.9L6 33.2h4.2l8.4-9.1 7.068 9.1H34L22.8 18.5 31.9 8h-3.5l-7.7 8.4L14.401 8H6.067Zm3.6 1.734h3.266l16.8 21.732H26.57L9.668 9.734Z">
+                            </path>
+                        </svg>
+                        <span class="usa-sr-only">NCBI on X (formerly known as Twitter)</span>
+                    </a>
+                
+
+                
+                    <a href="https://www.facebook.com/ncbi.nlm"
+                       class="ncbi-footer__social-icon ncbi-footer__social-icon--gray"
+                       target="_blank"
+                       rel="noreferrer noopener">
+                        <svg width="16"
+                             height="29"
+                             focusable="false"
+                             aria-hidden="true"
+                             viewBox="0 0 16 29"
+                             fill="none"
+                             xmlns="http://www.w3.org/2000/svg">
+                            <path d="M3.8809 21.4002C3.8809 19.0932 3.8809 16.7876 3.8809 14.478C3.8809 14.2117 3.80103 14.1452 3.54278 14.1492C2.53372 14.1638 1.52334 14.1492 0.514288 14.1598C0.302626 14.1598 0.248047 14.0972 0.248047 13.8936C0.256034 12.4585 0.256034 11.0239 0.248047 9.58978C0.248047 9.37013 0.302626 9.30224 0.528931 9.3049C1.53798 9.31688 2.54837 9.3049 3.55742 9.31555C3.80103 9.31555 3.8809 9.26097 3.87957 9.00272C3.87158 8.00565 3.85428 7.00592 3.90753 6.00884C3.97142 4.83339 4.31487 3.73115 5.04437 2.78467C5.93095 1.63318 7.15699 1.09005 8.56141 0.967577C10.5582 0.79319 12.555 0.982221 14.5518 0.927641C14.7102 0.927641 14.7462 0.99287 14.7449 1.13664C14.7449 2.581 14.7449 4.02668 14.7449 5.47104C14.7449 5.67604 14.6517 5.68669 14.4946 5.68669C13.4523 5.68669 12.4113 5.68669 11.3703 5.68669C10.3506 5.68669 9.92057 6.10868 9.90593 7.13904C9.89661 7.7647 9.91525 8.39303 9.89794 9.01869C9.88995 9.26364 9.96583 9.31822 10.2015 9.31688C11.7204 9.30623 13.2393 9.31688 14.7595 9.3049C15.0257 9.3049 15.0723 9.3728 15.0444 9.62439C14.89 10.9849 14.7515 12.3467 14.6144 13.7085C14.5691 14.1571 14.5785 14.1585 14.1458 14.1585C12.8386 14.1585 11.5313 14.1665 10.2254 14.1518C9.95119 14.1518 9.89794 14.2317 9.89794 14.4899C9.90593 19.0799 9.89794 23.6752 9.91125 28.2612C9.91125 28.5674 9.8407 28.646 9.53186 28.6433C7.77866 28.6273 6.02414 28.6366 4.27094 28.634C3.82499 28.634 3.87158 28.6992 3.87158 28.22C3.87602 25.9472 3.87913 23.6739 3.8809 21.4002Z">
+                            </path>
+                        </svg>
+                        <span class="usa-sr-only">NCBI on Facebook</span>
+                    </a>
+                
+
+                
+                    <a href="https://www.linkedin.com/company/ncbinlm"
+                       class="ncbi-footer__social-icon ncbi-footer__social-icon--gray"
+                       target="_blank"
+                       rel="noreferrer noopener">
+                        <svg width="25"
+                             height="23"
+                             viewBox="0 0 26 24"
+                             fill="none"
+                             xmlns="http://www.w3.org/2000/svg"
+                             focusable="false"
+                             aria-hidden="true">
+                            <path d="M14.6983 9.98423C15.6302 9.24808 16.5926 8.74754 17.6762 8.51991C19.673 8.09126 21.554 8.30824 23.1262 9.7526C24.2351 10.7723 24.7529 12.1115 25.0165 13.5612C25.1486 14.3363 25.2105 15.1218 25.2015 15.9081C25.2015 18.3043 25.2015 20.6898 25.2082 23.0806C25.2082 23.3468 25.1549 23.444 24.8621 23.4414C23.1297 23.4272 21.3992 23.4272 19.6704 23.4414C19.4041 23.4414 19.3429 23.3588 19.3442 23.1019C19.3535 20.5194 19.3442 17.9368 19.3442 15.3543C19.3442 14.0005 18.3258 12.9448 17.0266 12.9488C15.7273 12.9528 14.6983 14.0071 14.6983 15.361C14.6983 17.9328 14.6917 20.5047 14.6983 23.0753C14.6983 23.3708 14.6198 23.444 14.3296 23.4427C12.6185 23.4294 10.9079 23.4294 9.19779 23.4427C8.93155 23.4427 8.86099 23.3735 8.86232 23.1086C8.8783 19.7619 8.88628 16.4144 8.88628 13.066C8.88628 11.5688 8.87874 10.0708 8.86365 8.57182C8.86365 8.3575 8.90758 8.27896 9.14054 8.28029C10.9048 8.29094 12.6687 8.29094 14.4321 8.28029C14.6464 8.28029 14.6983 8.34818 14.6983 8.54653C14.6903 9.00047 14.6983 9.45441 14.6983 9.98423Z">
+                            </path>
+                            <path d="M6.55316 15.8443C6.55316 18.2564 6.55316 20.6699 6.55316 23.082C6.55316 23.3629 6.48127 23.4388 6.19906 23.4374C4.47737 23.4241 2.75568 23.4241 1.03399 23.4374C0.767751 23.4374 0.69986 23.3629 0.701191 23.1006C0.709178 18.2648 0.709178 13.4281 0.701191 8.59053C0.701191 8.34026 0.765089 8.27237 1.01669 8.2737C2.74991 8.28435 4.48048 8.28435 6.20838 8.2737C6.47462 8.2737 6.5465 8.33627 6.54517 8.6065C6.54783 11.0186 6.55316 13.4308 6.55316 15.8443Z">
+                            </path>
+                            <path d="M3.65878 0.243898C5.36804 0.243898 6.58743 1.45529 6.58743 3.1406C6.58743 4.75801 5.32145 5.95742 3.60819 5.96807C3.22177 5.97614 2.83768 5.90639 2.47877 5.76299C2.11985 5.61959 1.79344 5.40546 1.51897 5.13334C1.24449 4.86123 1.02755 4.53668 0.881058 4.17902C0.734563 3.82136 0.661505 3.43788 0.666231 3.05141C0.67555 1.42601 1.9362 0.242566 3.65878 0.243898Z">
+                            </path>
+                        </svg>
+                        <span class="usa-sr-only">NCBI on LinkedIn</span>
+                    </a>
+                
+
+                
+                    <a href="https://github.com/ncbi"
+                       class="ncbi-footer__social-icon ncbi-footer__social-icon--gray"
+                       target="_blank"
+                       rel="noreferrer noopener">
+                        <svg width="28"
+                             height="27"
+                             viewBox="0 0 28 28"
+                             fill="none"
+                             xmlns="http://www.w3.org/2000/svg"
+                             focusable="false"
+                             aria-hidden="true">
+                            <path d="M16.7228 20.6334C17.5057 20.5527 18.2786 20.3944 19.0301 20.1608C21.3108 19.4193 22.5822 17.8259 22.963 15.4909C23.1228 14.5112 23.1814 13.5287 22.9883 12.5437C22.8106 11.6423 22.4013 10.8028 21.8007 10.1076C21.7526 10.0605 21.7197 10 21.7064 9.934C21.6931 9.86799 21.7 9.79952 21.7262 9.73748C22.0856 8.6206 21.9711 7.51969 21.601 6.42677C21.582 6.3497 21.5345 6.2827 21.468 6.23923C21.4016 6.19577 21.3211 6.17906 21.2429 6.19248C20.7329 6.21649 20.2313 6.33051 19.7611 6.52928C19.1103 6.7908 18.4899 7.12198 17.9104 7.51703C17.84 7.56996 17.7581 7.60551 17.6713 7.62078C17.5846 7.63605 17.4954 7.6306 17.4112 7.60489C15.2596 7.05882 13.0054 7.06203 10.8554 7.61421C10.7806 7.63586 10.7018 7.63967 10.6253 7.62534C10.5487 7.611 10.4766 7.57892 10.4148 7.53167C9.64788 7.03247 8.85171 6.58918 7.96368 6.33359C7.65781 6.24338 7.34123 6.19458 7.02239 6.18849C6.94879 6.17986 6.87462 6.19893 6.81432 6.242C6.75402 6.28507 6.71191 6.34904 6.69621 6.42145C6.32342 7.51437 6.2209 8.61527 6.56307 9.73348C6.59635 9.84264 6.64694 9.93316 6.54177 10.0516C5.47666 11.2604 5.09988 12.6834 5.19574 14.2676C5.2663 15.4244 5.46201 16.5466 6.01454 17.5769C6.84399 19.1171 8.21664 19.9119 9.85158 20.3352C10.3938 20.4706 10.9444 20.5698 11.4998 20.632C11.5384 20.7492 11.4506 20.7798 11.408 20.8291C11.1734 21.1179 10.9894 21.4441 10.8634 21.7942C10.7622 22.0458 10.8315 22.4039 10.6065 22.5516C10.263 22.7766 9.83827 22.8485 9.42421 22.8871C8.17936 23.0056 7.26471 22.4877 6.6283 21.4348C6.25552 20.8184 5.76956 20.3325 5.08523 20.0663C4.76981 19.9325 4.42139 19.8967 4.08537 19.9638C3.7898 20.029 3.73788 20.1901 3.93891 20.4111C4.03639 20.5234 4.14989 20.6207 4.27575 20.6999C4.9796 21.1318 5.51717 21.7884 5.80152 22.5636C6.37002 23.9973 7.48039 24.5697 8.93825 24.6323C9.43741 24.6575 9.93768 24.615 10.4254 24.5058C10.5892 24.4672 10.6531 24.4872 10.6517 24.6762C10.6451 25.4936 10.6637 26.3123 10.6517 27.131C10.6517 27.6635 10.1684 27.9297 9.58663 27.7393C8.17396 
27.2671 6.84977 26.5631 5.66838 25.656C2.59555 23.2891 0.720966 20.1861 0.217704 16.3376C-0.357453 11.9127 0.911353 8.00824 3.98551 4.73881C6.11909 2.42656 8.99932 0.939975 12.1203 0.540191C16.5351 -0.0601815 20.4347 1.14323 23.7232 4.16373C26.2449 6.47869 27.724 9.37672 28.1048 12.7726C28.5828 17.0325 27.3686 20.7945 24.4768 23.9827C22.9762 25.6323 21.0956 26.8908 18.9982 27.6488C18.8783 27.6927 18.7585 27.738 18.636 27.7726C18.0356 27.9404 17.6189 27.6395 17.6189 27.0098C17.6189 25.7452 17.6308 24.4806 17.6295 23.2159C17.6329 22.9506 17.6128 22.6856 17.5696 22.4238C17.4325 21.6664 17.3419 21.484 16.7228 20.6334Z">
+                            </path>
+                        </svg>
+                        <span class="usa-sr-only">NCBI on GitHub</span>
+                    </a>
+                
+
+                
+                    <a href="https://ncbiinsights.ncbi.nlm.nih.gov/"
+                       class="ncbi-footer__social-icon ncbi-footer__social-icon--gray"
+                       target="_blank"
+                       rel="noreferrer noopener">
+                        <svg width="26"
+                             height="26"
+                             viewBox="0 0 27 27"
+                             fill="none"
+                             xmlns="http://www.w3.org/2000/svg"
+                             focusable="false"
+                             aria-hidden="true">
+                            <path d="M23.7778 26.4574C23.1354 26.3913 22.0856 26.8024 21.636 26.3087C21.212 25.8444 21.4359 24.8111 21.324 24.0347C19.9933 14.8323 14.8727 8.80132 6.09057 5.85008C4.37689 5.28406 2.58381 4.99533 0.779072 4.99481C0.202773 4.99481 -0.0229751 4.83146 0.00455514 4.21479C0.0660406 3.08627 0.0660406 1.95525 0.00455514 0.826734C-0.0413285 0.0815827 0.259669 -0.0193618 0.896534 0.00266238C6.96236 0.222904 12.3693 2.24179 16.9889 6.16209C22.9794 11.2478 26.1271 17.7688 26.4372 25.648C26.4629 26.294 26.3179 26.5271 25.6609 26.4684C25.0827 26.417 24.4991 26.4574 23.7778 26.4574Z">
+                            </path>
+                            <path d="M14.8265 26.441C14.0924 26.441 13.2371 26.6795 12.6626 26.3786C12.0092 26.0372 12.3781 25.0644 12.246 24.378C11.1154 18.5324 6.6849 14.5497 0.74755 14.1001C0.217135 14.0615 -0.0104482 13.9422 0.0134113 13.3659C0.0519536 12.1454 0.0482829 10.9213 0.0134113 9.69524C-0.00127145 9.14464 0.196946 9.03268 0.703502 9.04736C9.21217 9.27128 16.5994 16.2511 17.2804 24.7231C17.418 26.4446 17.418 26.4446 15.6579 26.4446H14.832L14.8265 26.441Z">
+                            </path>
+                            <path d="M3.58928 26.4555C2.64447 26.4618 1.73584 26.0925 1.06329 25.4289C0.39073 24.7653 0.00933763 23.8617 0.0030097 22.9169C-0.00331824 21.9721 0.365937 21.0635 1.02954 20.3909C1.69315 19.7184 2.59675 19.337 3.54156 19.3306C4.48637 19.3243 5.39499 19.6936 6.06755 20.3572C6.7401 21.0208 7.1215 21.9244 7.12782 22.8692C7.13415 23.814 6.7649 24.7226 6.10129 25.3952C5.43768 26.0677 4.53409 26.4491 3.58928 26.4555Z">
+                            </path>
+                        </svg>
+                        <span class="usa-sr-only">NCBI RSS feed</span>
+                    </a>
+                
+            </div>
+        </div>
+    
+
+    <div data-testid="gridContainer"
+         class="grid-container ncbi-footer__container">
+        <div class="grid-row ncbi-footer__main-content-container"
+             data-testid="grid">
+            
+                <div class="ncbi-footer__column">
+                    
+                        <p class="ncbi-footer__circled-icons-heading">
+                            Connect with NLM
+                        </p>
+                    
+
+                    <div class="ncbi-footer__circled-icons-list">
+                        
+                            <a href=https://twitter.com/nlm_nih class="ncbi-footer__social-icon ncbi-footer__social-icon--circled" target="_blank" rel="noreferrer noopener">
+                                <svg width="32"
+                                     height="32"
+                                     viewBox="0 0 40 40"
+                                     fill="none"
+                                     xmlns="http://www.w3.org/2000/svg"
+                                     focusable="false"
+                                     aria-hidden="true">
+                                    <path d="m6.067 8 10.81 13.9L6 33.2h4.2l8.4-9.1 7.068 9.1H34L22.8 18.5 31.9 8h-3.5l-7.7 8.4L14.401 8H6.067Zm3.6 1.734h3.266l16.8 21.732H26.57L9.668 9.734Z">
+                                    </path>
+                                </svg>
+                                <span class="usa-sr-only">NLM on X (formerly known as Twitter)</span>
+                            </a>
+                        
+
+                        
+                            <a href=https://www.facebook.com/nationallibraryofmedicine class="ncbi-footer__social-icon ncbi-footer__social-icon--circled" target="_blank" rel="noreferrer noopener">
+                                <svg width="13"
+                                     height="24"
+                                     viewBox="0 0 13 24"
+                                     fill="none"
+                                     xmlns="http://www.w3.org/2000/svg"
+                                     focusable="false"
+                                     aria-hidden="true">
+                                    <path d="M4.11371 23.1369C4.11371 23.082 4.11371 23.0294 4.11371 22.9745V12.9411H0.817305C0.6709 12.9411 0.670898 12.9411 0.670898 12.8016C0.670898 11.564 0.670898 10.3287 0.670898 9.09341C0.670898 8.97903 0.705213 8.95158 0.815017 8.95158C1.8673 8.95158 2.91959 8.95158 3.97417 8.95158H4.12057V8.83263C4.12057 7.8055 4.12057 6.7738 4.12057 5.74897C4.1264 4.92595 4.31387 4.11437 4.66959 3.37217C5.12916 2.38246 5.94651 1.60353 6.95717 1.1921C7.64827 0.905008 8.3913 0.764035 9.13953 0.778051C10.0019 0.791777 10.8644 0.830666 11.7268 0.860404C11.8869 0.860404 12.047 0.894717 12.2072 0.90158C12.2964 0.90158 12.3261 0.940469 12.3261 1.02968C12.3261 1.5421 12.3261 2.05452 12.3261 2.56465C12.3261 3.16857 12.3261 3.7725 12.3261 4.37642C12.3261 4.48165 12.2964 4.51367 12.1912 4.51138C11.5369 4.51138 10.8804 4.51138 10.2261 4.51138C9.92772 4.51814 9.63058 4.5526 9.33855 4.61433C9.08125 4.6617 8.84537 4.78881 8.66431 4.97766C8.48326 5.16652 8.3662 5.40755 8.32972 5.66661C8.28476 5.89271 8.26027 6.1224 8.25652 6.35289C8.25652 7.19014 8.25652 8.02969 8.25652 8.86923C8.25652 8.89439 8.25652 8.91955 8.25652 8.95615H12.0219C12.1797 8.95615 12.182 8.95616 12.1614 9.10714C12.0768 9.76596 11.9876 10.4248 11.9029 11.0813C11.8312 11.6319 11.7626 12.1824 11.697 12.733C11.6719 12.9434 11.6787 12.9434 11.4683 12.9434H8.26338V22.899C8.26338 22.979 8.26338 23.0591 8.26338 23.1392L4.11371 23.1369Z">
+                                    </path>
+                                </svg>
+                                <span class="usa-sr-only">NLM on Facebook</span>
+                            </a>
+                        
+
+                        
+                            <a href=https://www.youtube.com/user/NLMNIH class="ncbi-footer__social-icon ncbi-footer__social-icon--circled" target="_blank" rel="noreferrer noopener">
+                                <svg width="21"
+                                     height="15"
+                                     viewBox="0 0 21 15"
+                                     fill="none"
+                                     xmlns="http://www.w3.org/2000/svg"
+                                     focusable="false"
+                                     aria-hidden="true">
+                                    <path d="M19.2561 1.47914C18.9016 1.15888 18.5699 0.957569 17.2271 0.834039C15.5503 0.678484 13.2787 0.655608 11.563 0.65332H9.43556C7.71987 0.65332 5.4483 0.678484 3.77151 0.834039C2.43098 0.957569 2.097 1.15888 1.74242 1.47914C0.813665 2.32097 0.619221 4.62685 0.598633 6.89384C0.598633 7.31781 0.598633 7.74101 0.598633 8.16345C0.626084 10.4121 0.827391 12.686 1.74242 13.521C2.097 13.8412 2.4287 14.0425 3.77151 14.1661C5.4483 14.3216 7.71987 14.3445 9.43556 14.3468H11.563C13.2787 14.3468 15.5503 14.3216 17.2271 14.1661C18.5676 14.0425 18.9016 13.8412 19.2561 13.521C20.1712 12.6929 20.3725 10.451 20.3999 8.22064C20.3999 7.74025 20.3999 7.25986 20.3999 6.77946C20.3725 4.54907 20.1689 2.30724 19.2561 1.47914ZM8.55942 10.5311V4.65201L13.5601 7.50005L8.55942 10.5311Z"
+                                          fill="white" />
+                                </svg>
+                                <span class="usa-sr-only">NLM on YouTube</span>
+                            </a>
+                        
+                    </div>
+                </div>
+            
+
+            
+                <address class="ncbi-footer__address ncbi-footer__column">
+                    
+        <p>
+            <a class="usa-link usa-link--external"
+            href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/%4038.9959508,
+            -77.101021,17z/data%3D!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb%3A0x19156f88b27635b8!8m2!3d38.9959508!
+            4d-77.0988323"
+            rel="noopener noreferrer" target="_blank">National Library of Medicine
+            <br/> 8600 Rockville Pike<br/> Bethesda, MD 20894</a>
+        </p>
+    
+                </address>
+            
+
+            
+                <ul class="usa-list usa-list--unstyled ncbi-footer__vertical-list ncbi-footer__column">
+                    
+                        <li class="ncbi-footer__vertical-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.nlm.nih.gov/web_policies.html" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    Web Policies
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__vertical-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    FOIA
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__vertical-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="usa-link usa-link--external usa-link--alt ncbi-footer__link" rel="noreferrer noopener" target='_blank' >
+    
+
+    HHS Vulnerability Disclosure
+
+    
+</a>
+
+                        </li>
+                    
+                </ul>
+            
+
+            
+                <ul class="usa-list usa-list--unstyled ncbi-footer__vertical-list ncbi-footer__column">
+                    
+                        <li class="ncbi-footer__vertical-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://support.nlm.nih.gov/" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    Help
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__vertical-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.nlm.nih.gov/accessibility.html" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    Accessibility
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__vertical-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.nlm.nih.gov/careers/careers.html" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    Careers
+
+    
+</a>
+
+                        </li>
+                    
+                </ul>
+            
+        </div>
+
+        
+            <div class="grid-row grid-col-12" data-testid="grid">
+                <ul class="ncbi-footer__bottom-links-list">
+                    
+                        <li class="ncbi-footer__bottom-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.nlm.nih.gov/" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    NLM
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__bottom-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.nih.gov/" class="usa-link  usa-link--alt ncbi-footer__link"  >
+    
+
+    NIH
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__bottom-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.hhs.gov/" class="usa-link usa-link--external usa-link--alt ncbi-footer__link" rel="noreferrer noopener" target='_blank' >
+    
+
+    HHS
+
+    
+</a>
+
+                        </li>
+                    
+                        <li class="ncbi-footer__bottom-list-item">
+                            
+
+
+
+
+
+
+
+
+<a href="https://www.usa.gov/" class="usa-link usa-link--external usa-link--alt ncbi-footer__link" rel="noreferrer noopener" target='_blank' >
+    
+
+    USA.gov
+
+    
+</a>
+
+                        </li>
+                    
+                </ul>
+            </div>
+        
+    </div>
+</footer>
+
+
+
+        
+        
+    
+  <script  type="text/javascript" src="https://cdn.ncbi.nlm.nih.gov/core/pinger/pinger.js"> </script>
+
+
+    
+        
+
+<button class="back-to-top" data-ga-category="pagination" data-ga-action="back_to_top">
+    <label>Back to Top</label>
+    <svg class="usa-icon order-0" aria-hidden="true" focusable="false" role="img">
+        <use xlink:href="/static/img/sprite.svg#arrow_upward"></use>
+    </svg>
+</button>
+    
+
+
+        
+     <script src="https://code.jquery.com/jquery-3.5.0.min.js"
+          integrity="sha256-xNzN2a4ltkB44Mc/Jz3pT4iU1cmeR0FkXs4pru/JxaQ="
+          crossorigin="anonymous">
+    </script>
+    <script type="text/javascript">var exports = {};</script>
+     <script src="/static/CACHE/js/output.13b077bc3ffd.js"></script>
+    
+    
+        
+    <script type="application/javascript">
+    window.ncbi = window.ncbi || {};
+    window.ncbi.pmc = window.ncbi.pmc || {};
+    window.ncbi.pmc.options = {
+        logLevel: 'INFO',
+        
+        staticEndpoint: '/static/',
+        
+        citeCookieName: 'pmc-cf',
+    };
+</script>
+    <script type="module" crossorigin="" src="/static/assets/base-9bea7450.js"></script>
+
+    <script type="module" crossorigin="" src="/static/assets/article-722d91a2.js"></script>
+    
+    
+
+    </body>
+</html>
diff --git a/tests/data/PMC8885717.html b/tests/data/PMC/Pre-Oct-2024/PMC8885717.html
similarity index 100%
rename from tests/data/PMC8885717.html
rename to tests/data/PMC/Pre-Oct-2024/PMC8885717.html
diff --git a/tests/data/PMC8885717_abbreviations.json b/tests/data/PMC/Pre-Oct-2024/PMC8885717_abbreviations.json
similarity index 96%
rename from tests/data/PMC8885717_abbreviations.json
rename to tests/data/PMC/Pre-Oct-2024/PMC8885717_abbreviations.json
index 6ba3369..cf26b8d 100644
--- a/tests/data/PMC8885717_abbreviations.json
+++ b/tests/data/PMC/Pre-Oct-2024/PMC8885717_abbreviations.json
@@ -5,7 +5,7 @@
   "documents": [
     {
       "id": "PMC8885717",
-      "inputfile": "tests/data/PMC8885717.html",
+      "inputfile": "tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
       "passages": [
         {
           "text_short": "NLP",
diff --git a/tests/data/PMC8885717_bioc.json b/tests/data/PMC/Pre-Oct-2024/PMC8885717_bioc.json
similarity index 99%
rename from tests/data/PMC8885717_bioc.json
rename to tests/data/PMC/Pre-Oct-2024/PMC8885717_bioc.json
index 3280ede..53c29fa 100644
--- a/tests/data/PMC8885717_bioc.json
+++ b/tests/data/PMC/Pre-Oct-2024/PMC8885717_bioc.json
@@ -6,7 +6,7 @@
   "documents": [
     {
       "id": "PMC8885717",
-      "inputfile": "tests/data/PMC8885717.html",
+      "inputfile": "tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
       "infons": {},
       "passages": [
         {
diff --git a/tests/data/PMC8885717_tables.json b/tests/data/PMC/Pre-Oct-2024/PMC8885717_tables.json
similarity index 99%
rename from tests/data/PMC8885717_tables.json
rename to tests/data/PMC/Pre-Oct-2024/PMC8885717_tables.json
index 030dd22..c64dede 100644
--- a/tests/data/PMC8885717_tables.json
+++ b/tests/data/PMC/Pre-Oct-2024/PMC8885717_tables.json
@@ -5,7 +5,7 @@
   "infons": {},
   "documents": [
     {
-      "inputfile": "tests/data/PMC8885717.html",
+      "inputfile": "tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
       "id": "1",
       "infons": {},
       "passages": [
@@ -784,7 +784,7 @@
       ]
     },
     {
-      "inputfile": "tests/data/PMC8885717.html",
+      "inputfile": "tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
       "id": "2",
       "infons": {},
       "passages": [
@@ -1143,7 +1143,7 @@
       ]
     },
     {
-      "inputfile": "tests/data/PMC8885717.html",
+      "inputfile": "tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
       "id": "3",
       "infons": {},
       "passages": [
@@ -1325,7 +1325,7 @@
       ]
     },
     {
-      "inputfile": "tests/data/PMC8885717.html",
+      "inputfile": "tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
       "id": "4",
       "infons": {},
       "passages": [
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 3443533..bc51c0a 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -13,23 +13,23 @@ def test_autoCORPus():
     from autocorpus.autoCORPus import autoCORPus
 
     with open(
-        Path(__file__).parent / "data" / "PMC8885717_abbreviations.json",
+        Path(__file__).parent / "data" / "PMC" / "Pre-Oct-2024" / "PMC8885717_abbreviations.json",
         encoding="utf-8",
     ) as f:
         expected_abbreviations = json.load(f)
     with open(
-        Path(__file__).parent / "data" / "PMC8885717_bioc.json", encoding="utf-8"
+        Path(__file__).parent / "data" / "PMC" / "Pre-Oct-2024" / "PMC8885717_bioc.json", encoding="utf-8"
     ) as f:
         expected_bioc = json.load(f)
     with open(
-        Path(__file__).parent / "data" / "PMC8885717_tables.json", encoding="utf-8"
+        Path(__file__).parent / "data" / "PMC" / "Pre-Oct-2024" / "PMC8885717_tables.json", encoding="utf-8"
     ) as f:
         expected_tables = json.load(f)
 
     auto_corpus = autoCORPus(
-        "configs/config_pmc.json",
-        base_dir="tests/data",
-        main_text="tests/data/PMC8885717.html",
+        "autocorpus/configs/config_pmc_pre_oct_2024.json",
+        base_dir="tests/data/PMC/Pre-Oct-2024",
+        main_text="tests/data/PMC/Pre-Oct-2024/PMC8885717.html",
     )
 
     abbreviations = auto_corpus.abbreviations

From c5d557e09f2f69d46ba4fb0b2d0ea7e532123c32 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 28 Nov 2024 18:47:56 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 autocorpus/configs/config_pmc.json |  9 +++------
 tests/test_regression.py           | 20 +++++++++++++++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/autocorpus/configs/config_pmc.json b/autocorpus/configs/config_pmc.json
index b00297c..5e2117b 100644
--- a/autocorpus/configs/config_pmc.json
+++ b/autocorpus/configs/config_pmc.json
@@ -2,12 +2,9 @@
     "config": {
         "references": {
             "data": {
-                "title": [
-                ],
-                "journal": [
-                ],
-                "volume": [
-                ]
+                "title": [],
+                "journal": [],
+                "volume": []
             },
             "defined-by": [
                 {
diff --git a/tests/test_regression.py b/tests/test_regression.py
index bc51c0a..df4c6d0 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -13,16 +13,30 @@ def test_autoCORPus():
     from autocorpus.autoCORPus import autoCORPus
 
     with open(
-        Path(__file__).parent / "data" / "PMC" / "Pre-Oct-2024" / "PMC8885717_abbreviations.json",
+        Path(__file__).parent
+        / "data"
+        / "PMC"
+        / "Pre-Oct-2024"
+        / "PMC8885717_abbreviations.json",
         encoding="utf-8",
     ) as f:
         expected_abbreviations = json.load(f)
     with open(
-        Path(__file__).parent / "data" / "PMC" / "Pre-Oct-2024" / "PMC8885717_bioc.json", encoding="utf-8"
+        Path(__file__).parent
+        / "data"
+        / "PMC"
+        / "Pre-Oct-2024"
+        / "PMC8885717_bioc.json",
+        encoding="utf-8",
     ) as f:
         expected_bioc = json.load(f)
     with open(
-        Path(__file__).parent / "data" / "PMC" / "Pre-Oct-2024" / "PMC8885717_tables.json", encoding="utf-8"
+        Path(__file__).parent
+        / "data"
+        / "PMC"
+        / "Pre-Oct-2024"
+        / "PMC8885717_tables.json",
+        encoding="utf-8",
     ) as f:
         expected_tables = json.load(f)