# JSON schema of full text documents

{
    "paper_id": <str>,  # 40-character sha1 of the PDF
    "metadata": {
        "title": <str>,
        "authors": [  # list of author dicts, in order
            {
                "first": <str>,
                "middle": <list of str>,
                "last": <str>,
                "suffix": <str>,
                "affiliation": <dict>,
                "email": <str>
            },
            ...
        ],
        "abstract": [  # list of paragraphs in the abstract
            {
                "text": <str>,
                "cite_spans": [  # list of character indices of inline citations
                                 # e.g. citation "[7]" occurs at positions 151-154 in "text"
                                 # linked to bibliography entry BIBREF3
                    {
                        "start": 151,
                        "end": 154,
                        "text": "[7]",
                        "ref_id": "BIBREF3"
                    },
                    ...
                ],
                "ref_spans": <list of dicts similar to cite_spans>,  # e.g. inline reference to "Table 1"
                "section": "Abstract"
            },
            ...
        ],
        "body_text": [  # list of paragraphs in full body
                        # paragraph dicts look the same as above
            {
                "text": <str>,
                "cite_spans": [],
                "ref_spans": [],
                "eq_spans": [],
                "section": "Introduction"
            },
            ...
            {
                ...,
                "section": "Conclusion"
            }
        ],
        "bib_entries": {
            "BIBREF0": {
                "ref_id": <str>,
                "title": <str>,
                "authors": <list of dict>,  # same structure as earlier,
                                            # but without `affiliation` or `email`
                "year": <int>,
                "venue": <str>,
                "volume": <str>,
                "issn": <str>,
                "pages": <str>,
                "other_ids": {
                    "DOI": [
                        <str>
                    ]
                }
            },
            "BIBREF1": {},
            ...
            "BIBREF25": {}
        },
        "ref_entries": {
            "FIGREF0": {
                "text": <str>,  # figure caption text
                "type": "figure"
            },
            ...
            "TABREF13": {
                "text": <str>,  # table caption text
                "type": "table"
            }
        },
        "back_matter": <list of dict>  # same structure as body_text
    }
}