#!/usr/bin/python3
# Provenance: app.py as created by commit 7b2b16af1bc952d6f283a72bebf7becacedbd748
# (Anton Luka Šijanec, Tue, 15 Nov 2022 18:35:22 +0100):
# "use DateTime instead of ISO 8601, added missing duration to db"
"""Download .acsm fulfillment tokens from biblos.si and store them in a database.

Usage: app.py <sqlalchemy db URL> <operator contact>

Every successfully parsed acsm becomes one ``Borrow`` row that points to a
``Book`` row (created or refreshed from the acsm metadata).
"""
from sys import argv
import logging
from time import localtime, mktime, time
import requests
from base64 import b64decode
from datetime import datetime, timedelta, timezone
try:
    from sqlalchemy import Table, MetaData, Integer, BigInteger, String, Column, ForeignKey, create_engine, select, DateTime
    from sqlalchemy.orm import declarative_base, relationship, Session
except ModuleNotFoundError:
    raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy")
try:
    from bs4 import BeautifulSoup, FeatureNotFound
except ModuleNotFoundError:
    raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")

# Contact string sent in the User-Agent header. Falls back to None when the
# argument is absent so that the usage check in the __main__ guard (rather than
# an IndexError raised here at import time) reports a bad command line.
operator_contact = argv[2] if len(argv) > 2 else None

Base = declarative_base()


class Book(Base):
    """A book, keyed by ISBN, as described by the metadata of an acsm."""
    __tablename__ = "books"
    isbn = Column(BigInteger, primary_key=True, nullable=False, doc="book isbn. found in URL http://www/isbn/978 and in acsm: resource, dc:identifier (sometimes not), thumbnailURL")
    title = Column(String, nullable=True, doc="title of the book, dcc:title in acsm")
    creator = Column(String, nullable=True, doc="author of the book, dc:creator in acsm")
    publisher = Column(String, nullable=True, doc="publisher of the book, dc:publisher in acsm")
    identifier = Column(String, nullable=True, doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.")
    thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element")
    format = Column(String, nullable=True, doc="format of the file. I've seen application/pdf and application/epub+zip")
    language = Column(String, nullable=True, doc="language of the book. I've seen sl.")
    borrows = relationship("Borrow", back_populates="book")

    def __repr__(self):
        return f"Book(isbn={self.isbn!r}, title={self.title!r}, creator={self.creator!r}, publisher={self.publisher!r})"


class Borrow(Base):
    """One downloaded acsm (a single borrow transaction) referencing a Book."""
    __tablename__ = "borrows"
    id = Column(Integer, primary_key=True, nullable=False, doc="id in transaction element of acsm or in filename of acsm on http")
    isbn = Column(ForeignKey("books.isbn"), nullable=False, doc="foreign key that leads to a book")
    transaction = Column(String, nullable=True, doc="transaction element content, but only if it couldn't be derived from format ACS-BIBL-L-{acsm_id}, otherwise Null")
    purchase_utc = Column(DateTime, nullable=True, doc="acsm purchase element excluding timezone in UTC")
    expiration_utc = Column(DateTime, nullable=True, doc="acsm expiration element excluding timezone in UTC")
    purchase_timezone = Column(Integer, nullable=True, doc="acsm purchase element timezone offset from UTC in seconds (note that purchase is UTC)")
    expiration_timezone = Column(Integer, nullable=True, doc="acsm expiration element timezone offset from UTC in seconds (note that expiration is UTC)")
    obtained = Column(BigInteger, nullable=False, doc="UNIX timestamp when this borrow was obtained as acsm from http")
    duration = Column(Integer, nullable=True, doc="duration in seconds that a DRM client may make the book available")
    book = relationship("Book", back_populates="borrows")

    def __repr__(self):
        return f"Borrow(id={self.id!r}, isbn={self.isbn!r}, purchase={self.purchase_utc!r}, purchase_timezone={self.purchase_timezone!r} expiration={self.expiration_utc!r}, expiration_timezone={self.expiration_timezone!r}, obtained=mktime({localtime(self.obtained)!r}), duration={self.duration!r}, book={self.book!r})"


logging.basicConfig(level=logging.NOTSET)
logger = logging.getLogger(argv[0])
logger.debug("welcome to %s", argv[0])

# First acsm id requested when the database holds no borrows yet.
starting_acsm_id = 177238
# An acsm id that existed on the server when the program was written; running
# out of acsms below it is reported as an error.
guaranteed_large_acsm_id = 1170487


def _split_utc_and_offset(stamp):
    """Parse an acsm timestamp into (naive datetime in UTC, UTC offset in seconds).

    ``stamp`` must match ``%Y-%m-%dT%H:%M:%S%z``; ``datetime.strptime`` raises
    ValueError otherwise.
    """
    aware = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S%z")
    # total_seconds() keeps the sign; timedelta.seconds alone would turn an
    # offset of -01:00 into 82800.
    offset = int(aware.utcoffset().total_seconds())
    return aware.astimezone(timezone.utc).replace(tzinfo=None), offset


def _parse_acsm(text, acsm_id):
    """Extract the book and borrow fields from the XML text of one acsm.

    Returns ``(isbn, book_fields, borrow_fields)`` where the two dicts hold
    keyword arguments for ``Book`` and ``Borrow`` respectively (without the
    keys the caller supplies itself: ``isbn``, ``id``, ``obtained``, ``book``).

    Raises ValueError when the document is internally inconsistent, and
    AttributeError when an element that is read unconditionally is missing.
    """
    try:
        acsm = BeautifulSoup(text, "xml", from_encoding="UTF-8")
    except FeatureNotFound as err:
        raise FeatureNotFound("pip3 install lxml") from err
    ft = acsm.fulfillmentToken

    # The transaction string is only worth storing when it deviates from the
    # predictable ACS-BIBL-L-<id> form.
    transaction = None
    expected = f"ACS-BIBL-L-{acsm_id}"
    if ft.transaction.string != expected:
        transaction = ft.transaction.string
        logger.info(f"expected {expected} in transaction.string, but instead received {ft.transaction.string} in acsm {acsm_id}")

    # The ISBN is the numeric last "-"-separated part of the resource string plus 9e12.
    isbn = int(ft.resourceItemInfo.resource.string.split("-").pop())+int(9e12)

    identifier_to_isbn = 0
    identifier = "noidentifier"
    try:
        identifier = ft.resourceItemInfo.metadata.identifier.string
        identifier_to_isbn = int(identifier.split(":").pop().replace("-", ""))
    except (ValueError, AttributeError):
        pass
    # Drop the identifier only when it really is the ISBN; any other value
    # (including a different number) cannot be derived from isbn and is kept.
    if identifier_to_isbn != 0 and identifier_to_isbn == isbn:
        identifier = None

    expected = ft.resourceItemInfo.resource.string
    if ft.licenseToken.resource.string != expected:
        raise ValueError(f"expected {expected} in ft.resourceItemInfo.licenseToken.resource.string but instead received {ft.resourceItemInfo.licenseToken.resource.string} in acsm {acsm_id}")
    uuid = expected.split(":").pop()

    expected = f"https://cs.alliance.inkbook.eu/books/{uuid}."
    try:
        if not ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected):
            raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
        thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
    except AttributeError:
        # A missing thumbnailURL element is acceptable; an unparsable one is not.
        thumbnail_extension = None
        if ft.resourceItemInfo.metadata.thumbnailURL is not None:
            raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}")

    duration = int(ft.resourceItemInfo.licenseToken.permissions.display.duration.string)
    play_duration = int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string)
    if duration != play_duration:
        raise ValueError(f"expected {duration} in fr.int(resourceItemInfo.licenseToken.permissions.play.duration.string) but instead received {play_duration} in acsm {acsm_id}")

    # The decoded value is not used; the call only rejects an hmac element that
    # is not valid base64.
    b64decode(ft.hmac.string, validate=True)

    metadata = ft.resourceItemInfo.metadata
    book_fields = {
        "identifier": identifier,
        "title": metadata.find(name="dc:title").string,
        "creator": metadata.creator.string,
        "publisher": metadata.publisher.string,
        "thumbnail_extension": thumbnail_extension,
        "language": metadata.language.string,
        "format": metadata.format.string,
    }
    purchase_utc, purchase_timezone = _split_utc_and_offset(ft.purchase.string)
    expiration_utc, expiration_timezone = _split_utc_and_offset(ft.expiration.string)
    borrow_fields = {
        "transaction": transaction,
        "purchase_utc": purchase_utc,
        "expiration_utc": expiration_utc,
        "purchase_timezone": purchase_timezone,
        "expiration_timezone": expiration_timezone,
        "duration": duration,
    }
    return isbn, book_fields, borrow_fields


def update(engine, hmfan2iarts=100):
    """Fetch acsms with consecutive ids and store every valid one.

    Starts after the highest stored ``Borrow.id`` (or at ``starting_acsm_id``
    on an empty database) and requests one acsm per iteration. The loop ends
    when a non-200 response arrives while the newest stored borrow was
    purchased less than an hour ago, or after ``hmfan2iarts`` consecutive
    non-200 responses.

    engine: SQLAlchemy engine whose database already has the tables created.
    hmfan2iarts: number of consecutive non-200 responses that ends the run.

    Returns a dict with the counters ``valid_acsms``, ``only_isbn_acsms``,
    ``failed_acsms``, ``failed_acsms_not200`` and the last requested
    ``acsm_id``. Exceptions from parsing a malformed acsm propagate.
    """
    force_acsm_id = 0
    valid_acsms = 0
    only_isbn_acsms = 0
    failed_acsms = 0
    failed_acsms_not200 = 0
    failed_acsms_not200_in_a_row = 0
    borrow = None
    with Session(engine) as session:
        while True:
            if force_acsm_id != 0:
                acsm_id = force_acsm_id
                force_acsm_id = 0
            else:
                borrow = session.scalars(select(Borrow).order_by(Borrow.id.desc()).limit(1)).first()
                acsm_id = starting_acsm_id
                if borrow is None:
                    logger.info(f"oooh, it looks like this is a fresh start, db contains no borrows. I'll start with hardcoded acsm id {starting_acsm_id}")
                else:
                    logger.info(f"continuing from latest {borrow}")
                    acsm_id = borrow.id+1
            r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm", headers={"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"})
            r.encoding = "UTF-8"
            if r.status_code == 200:
                failed_acsms_not200_in_a_row = 0
            if r.status_code != 200:
                # purchase_utc is stored as a naive datetime in UTC, so it has
                # to be compared with a naive UTC "now"; borrow is None on a
                # database without borrows.
                an_hour_ago = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(hours=1)
                if borrow is not None and borrow.purchase_utc is not None and borrow.purchase_utc > an_hour_ago:
                    logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id} and the last requested acsm was created less than an hour ago")
                    break
                logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {hmfan2iarts-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.")
                failed_acsms_not200 += 1
                failed_acsms_not200_in_a_row += 1
                force_acsm_id = acsm_id+1
                if failed_acsms_not200_in_a_row == hmfan2iarts:
                    logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means {hmfan2iarts} concurrent responses that are not 200.")
                    if acsm_id < guaranteed_large_acsm_id:
                        logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.")
                    break
            elif r.text.startswith("Napaka pri prenosu"):
                logger.warning(f"'napaka pri prenosu' received from http for acsm id {acsm_id}, skipping")
                force_acsm_id = acsm_id+1
            # NOTE(review): in the copy this was taken from, the prefix literal
            # was empty, which matches every response and made the parsing
            # branch below unreachable. "<error" is assumed from the log
            # message - confirm against a real error response of the server.
            elif r.text.startswith("<error"):
                logger.warning(f"received urllink parameter syntax error with no usable data for acsm {acsm_id}, so I did not store anything")
                force_acsm_id = acsm_id+1
                if acsm_id >= 199999 and acsm_id <= 999999:
                    logger.warning(f"on 2022-11-07, library removed access for acsms 200000-999999. skipping to 1000000")
                    force_acsm_id = 1000000
                failed_acsms += 1
            else:
                isbn, book_fields, borrow_fields = _parse_acsm(r.text, acsm_id)
                book = session.get(Book, isbn)
                if book is None:
                    book = Book(isbn=isbn, **book_fields)
                else:
                    # Refresh a known book with the metadata of the newest acsm.
                    for attribute, value in book_fields.items():
                        setattr(book, attribute, value)
                borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time()), book=book, **borrow_fields)
                logger.info(f"found a new {borrow!r}")
                session.add(borrow)
                session.commit()
                valid_acsms += 1
    logger.info(f"In this update, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")
    return {"valid_acsms": valid_acsms, "only_isbn_acsms": only_isbn_acsms, "failed_acsms": failed_acsms, "failed_acsms_not200": failed_acsms_not200, "acsm_id": acsm_id}


if __name__ == "__main__":
    if len(argv) != 1+2:
        raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")
    engine = create_engine(argv[1], echo=True, future=True)
    Base.metadata.create_all(engine)
    logger.debug(f"created metadata.")
    try:
        update(engine)
    except KeyboardInterrupt:
        logger.warning(f"Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")