diff options
Diffstat (limited to 'gather.py')
-rwxr-xr-x | gather.py | 27 |
1 file changed, 20 insertions, 7 deletions
@@ -14,9 +14,10 @@ try: except ModuleNotFoundError: raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4") -if len(argv) != 1+1: - raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db") +if len(argv) != 1+2: + raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address") +operator_contact = argv[2] engine = create_engine(argv[1], echo=True, future=True) Base = declarative_base() @@ -53,6 +54,7 @@ logger.debug("welcome to %s", argv[0]) Base.metadata.create_all(engine) starting_acsm_id = 177238 +guaranteed_large_acsm_id = 1170487 logger.debug(f"created metadata.") force_acsm_id = 0 @@ -60,6 +62,8 @@ force_acsm_id = 0 valid_acsms = 0 only_isbn_acsms = 0 failed_acsms = 0 +failed_acsms_not200 = 0 +failed_acsms_not200_in_a_row = 0 try: with Session(engine) as session: @@ -75,10 +79,19 @@ try: else: logger.info(f"continuing from latest {borrow}") acsm_id = borrow.id+1 - r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm") + r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm", headers={"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"}) + if (r.status_code == 200): + failed_acsms_not200_in_a_row = 0 if r.status_code != 200: - logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}. latest borrow is {borrow}") - break + logger.warning(f"received http response with error code not 200 (it is {r.status_code}). 
if this continues for {10-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.") + failed_acsms_not200 += 1 + failed_acsms_not200_in_a_row += 1 + force_acsm_id = acsm_id+1 + if failed_acsms_not200_in_a_row == 10: + logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means 10 concurrent responses that are not 200.") + if acsm_id < guaranteed_large_acsm_id: + logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.") + break elif r.text.startswith('<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_NO_SUCH_RESOURCE resid urn:uuid:00000000-1002-0000-0009-78'): isbn = int(r.text.split()[4].split("-").pop())+int(9e12) borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time())) @@ -91,7 +104,7 @@ try: force_acsm_id = acsm_id+1 failed_acsms += 1 else: - acsm = BeautifulSoup(r.text, "xml") + acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8") ft = acsm.fulfillmentToken expected = f"ACS-BIBL-L-{acsm_id}" if ft.transaction.string != expected: @@ -148,7 +161,7 @@ try: except KeyboardInterrupt: logger.warning(f"Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.") -logger.info(f"In this session, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received. Last valid requested acsm was {acsm_id}. Thank you for cooperation.") +logger.info(f"In this session, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.") """ metadata = MetaData() |