Skip to content

Commit

Permalink
Merge pull request #73 from internetarchive/issue-48
Browse files Browse the repository at this point in the history
Adding annotations from djvu file
  • Loading branch information
benwbrum committed May 24, 2024
2 parents 3313578 + ded506d commit 730568b
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 2 deletions.
7 changes: 6 additions & 1 deletion iiify/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from flask_caching import Cache
from iiif2 import iiif, web
from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
purify_domain, cantaloupe_resolver, create_collection3, IsCollection
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
from .configs import options, cors, approot, cache_root, media_root, \
cache_expr, version, image_server, cache_timeouts
from urllib.parse import quote
Expand Down Expand Up @@ -191,6 +191,11 @@ def manifest3(identifier):
raise excpt
# abort(404)

@app.route('/iiif/<version>/annotations/<identifier>/<fileName>/<canvas_no>.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def annnotations(version, identifier, fileName, canvas_no):
    """Serve the annotation page for one canvas of an item.

    All four URL segments are forwarded unchanged to ``create_annotations``.
    The base domain comes from the optional ``domain`` query argument and
    falls back to the request's URL root; it is passed through
    ``purify_domain`` before use.

    NOTE(review): the function name has three n's. Flask uses the function
    name as the default endpoint name, so renaming it may affect any
    ``url_for`` callers -- confirm before correcting the spelling.
    """
    requested_domain = request.args.get('domain', request.url_root)
    annotation_page = create_annotations(
        version,
        identifier,
        fileName,
        canvas_no,
        domain=purify_domain(requested_domain),
    )
    return ldjsonify(annotation_page)

@app.route('/iiif/<identifier>/manifest.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
Expand Down
69 changes: 68 additions & 1 deletion iiify/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import requests
from iiif2 import iiif, web
from .configs import options, cors, approot, cache_root, media_root, apiurl, LINKS
from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef
from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage,AnnotationPageRef, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef

from urllib.parse import urlparse, parse_qs, quote
import json
import math
import re
import xml.etree.ElementTree as ET

IMG_CTX = 'http://iiif.io/api/image/2/context.json'
PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
Expand Down Expand Up @@ -468,9 +470,12 @@ def create_manifest3(identifier, domain=None, page=None):
# subprefix can be different from the identifier use the scandata filename to find the correct prefix
# if not present fall back to identifier
subprefix = identifier
djvuFile = ""
for fileMd in metadata['files']:
if fileMd['name'].endswith('_scandata.xml'):
subprefix = fileMd['name'].replace('_scandata.xml', '')
if fileMd['format'] == 'Djvu XML':
djvuFile = fileMd['name']

bookReaderURL = f"https://{metadata.get('server')}/BookReader/BookReaderJSIA.php?id={identifier}&itemPath={metadata.get('dir')}&server={metadata.get('server')}&format=jsonp&subPrefix={subprefix}"

Expand Down Expand Up @@ -531,7 +536,20 @@ def create_manifest3(identifier, domain=None, page=None):
except:
pass

# Add annotations if djvu file is present
if djvuFile:
count = 1
for canvas in manifest.items:
if 'annotations' in canvas:
annotations = canvas.annotations
else:
annotations = []

annotations.append(
AnnotationPageRef(id=f"{domain}3/annotations/{identifier}/{quote(djvuFile, safe='()')}/{count}.json", type="AnnotationPage")
)
canvas.annotations = annotations
count += 1
elif mediatype == 'image':
(multiFile, format) = checkMultiItem(metadata)
print (f"Checking multiFile {multiFile} {format}")
Expand Down Expand Up @@ -710,6 +728,55 @@ def create_manifest3(identifier, domain=None, page=None):

return json.loads(manifest.jsonld())

def create_annotations(version, identifier, fileName, canvas_no, domain=None):
    """Build an AnnotationPage holding one annotation per OCR word on a page.

    The DjVu XML file ``fileName`` of item ``identifier`` is downloaded from
    ``ARCHIVE``. The ``canvas_no``-th ``OBJECT`` element is selected and every
    ``WORD`` element below it becomes one text annotation whose target is the
    word's bounding box on the canvas.

    Args:
        version: API version segment used when building the page id.
        identifier: Item identifier.
        fileName: Name of the DjVu XML file inside the item.
        canvas_no: 1-based page number; must be convertible with ``int()``.
        domain: URL prefix used when building the page id.

    Returns:
        The annotation page as a plain ``dict`` (its JSON-LD parsed back with
        ``json.loads``).

    Raises:
        ValueError: If ``canvas_no`` or a word's coordinates are not integers,
            or if the XML file cannot be fetched or parsed.
        IndexError: If the document has no ``OBJECT`` element at position
            ``canvas_no``.
    """
    annotationPage = AnnotationPage(id=f"{domain}{version}/annotations/{identifier}/{quote(fileName, safe='()')}/{canvas_no}.json")
    annotationPage.items = []

    # Validate the page number once; canvases are addressed 0-based in the
    # annotation ids/targets, while the XPath position is 1-based.
    page_number = int(canvas_no)
    index = page_number - 1
    url = f"{ARCHIVE}/download/{identifier}/{fileName}"

    try:
        # Fetch the remote XML file.
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML file: {e}")
        raise ValueError(f"Failed to retrieve {url}") from e

    try:
        # Parse the XML content.
        djvu = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Error parsing the XML content: {e}")
        raise ValueError(f"Failed to process {url}") from e

    # Use the validated integer so no arbitrary text reaches the XPath.
    page = djvu.findall(f".//OBJECT[{page_number}]")[0]

    for count, word in enumerate(page.findall(".//WORD"), start=1):
        # <WORD coords="444,1353,635,1294" x-confidence="10">[David </WORD>
        # coords are "left_x,bottom_y,right_x,top_y"; xywh wants the top-left
        # corner plus width and height.
        left_x, bottom_y, right_x, top_y = (
            int(value) for value in word.attrib['coords'].split(',')
        )
        x = left_x
        y = top_y
        width = right_x - left_x
        height = bottom_y - top_y
        annotationPage.items.append({
            "id": f"https://iiif.archive.org/iiif/{identifier}/canvas/{index}/anno/{count}",
            "type": "Annotation",
            # Transcribed page text is "supplementing" content; this is also
            # the value the accompanying tests assert.
            "motivation": "supplementing",
            "body": {
                "type": "TextualBody",
                "format": "text/plain",
                "value": word.text
            },
            "target": f"https://iiif.archive.org/iiif/{identifier}${index}/canvas#xywh={x},{y},{width},{height}"
        })

    return json.loads(annotationPage.jsonld())

def coerce_list(value):
if isinstance(value, list):
return ". ".join(value)
Expand Down
57 changes: 57 additions & 0 deletions tests/test_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import unittest
from flask.testing import FlaskClient
from iiify.app import app

class TestAnnotations(unittest.TestCase):
    """Checks the v3 manifest's annotation references and the annotation endpoint.

    Both tests request the item ``journalofexpedit00ford`` through a Flask
    test client with ``recache=true`` on the query string.
    """

    def setUp(self) -> None:
        self.test_app = FlaskClient(app)

    def test_v3_manifest_has_annotations(self):
        """Every canvas must reference its own annotation page, without items."""
        resp = self.test_app.get("/iiif/3/journalofexpedit00ford/manifest.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        manifest = resp.json

        # Annotation page URLs are numbered from 1, in canvas order.
        for count, canvas in enumerate(manifest['items'], start=1):
            self.assertIn('annotations', canvas, f"Expected annotations in canvas {canvas['id']}")
            annotations_url = f"https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/{count}.json"
            found = False
            for anno in canvas['annotations']:
                if anno['id'] == annotations_url:
                    found = True
                    self.assertNotIn('items', anno, "As a referenced AnnotationPage it shouldn't contain items.")
                    self.assertEqual(anno.get('type'), "AnnotationPage", f"Expected annotation page to have a type {anno}")

            self.assertTrue(found, f"Expected to find {annotations_url} in {canvas['annotations']}")

    def test_v3_annotations(self):
        """The first page's annotation list has the expected shape and content."""
        resp = self.test_app.get("/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        annotations = resp.json

        self.assertEqual(annotations['id'], "https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json", "Unexpected id")
        self.assertEqual(annotations['@context'], "http://iiif.io/api/presentation/3/context.json", "Unexpected context")
        self.assertEqual(annotations['type'], "AnnotationPage", "Unexpected type, expected AnnotationPage")
        annotationList = annotations['items']
        self.assertEqual(len(annotationList), 6, "Unexpected number of annotations")

        ids = set()
        for position, anno in enumerate(annotationList):
            self.assertNotIn(anno['id'], ids, f"Duplicate ID: {anno['id']}")
            ids.add(anno['id'])
            self.assertEqual(anno['type'], "Annotation", "Expected type of Annotation")
            self.assertTrue("body" in anno and "target" in anno, f"Body or target missing from annotation {anno}")
            self.assertEqual(anno['body']['type'], "TextualBody", "Expected body to be a TextualBody")
            self.assertEqual(anno['body']['format'], "text/plain", "Expected format to be a text/plain")
            self.assertEqual(anno['target'].split('#')[0], "https://iiif.archive.org/iiif/journalofexpedit00ford$0/canvas")
            # Only the first word's box and text are pinned to exact values.
            if position == 0:
                self.assertEqual(anno['target'].split('#')[1], "xywh=592,1742,460,118")
                self.assertEqual(anno['body']['value'], "JOURNAL ")

            self.assertEqual(anno['motivation'], "supplementing", "Expected motivation of supplementing")


0 comments on commit 730568b

Please sign in to comment.