Use a better source image for pytesseract, which will result in more reliable OCR

2025-01-23 19:36:56 -08:00 · 2024-01-17 16:12:19 +13:00 · 2024-01-17 16:12:19 +13:00 · ea224e291b
commit ea224e291b
parent 321731a5f0
1 changed files with 17 additions and 1 deletions
--- a/app/activitypub/util.py
+++ b/app/activitypub/util.py
@ -554,7 +554,7 @@ def make_image_sizes_async(file_id, thumbnail_width, medium_width, directory):

                    # Alert regarding fascist meme content
                    try:
-                        image_text = pytesseract.image_to_string(Image.open(final_place).convert('L'))
+                        image_text = pytesseract.image_to_string(Image.open(BytesIO(source_image)).convert('L'))
                    except FileNotFoundError as e:
                        image_text = ''
                    if 'Anonymous' in image_text and ('No.' in image_text or ' N0' in image_text):   # chan posts usually contain the text 'Anonymous' and ' No.12345'
@ -1164,6 +1164,22 @@ def undo_vote(activity_log, comment, post, target_ap_id, user):
    return post


+# given an activitypub id for a post or comment, retrieve it and all its parent objects
+def backfill_from_ap_id(ap_id: str):
+    if ap_id.startswith(f"https://{current_app.config['SERVER_NAME']}"):  # ap_id begins with this server's own https URL
+        ...  # placeholder: nothing is done for local ids yet, so this branch falls through and returns None
+    else:
+        try:
+            activity_data = get_request(ap_id, headers={'Accept': 'application/activity+json'})
+        except requests.exceptions.ReadTimeout:  # on a read timeout, retry exactly once after a random 3-10 second pause
+            time.sleep(randint(3, 10))
+            activity_data = get_request(ap_id, headers={'Accept': 'application/activity+json'})  # a second ReadTimeout here is not caught
+        if activity_data.status_code == 200:  # any other status code falls through and returns None
+            actor_json = activity_data.json()  # NOTE(review): named actor_json although the header comment says ap_id is a post or comment - confirm
+            activity_data.close()
+            return actor_json_to_model(actor_json, address, server)  # NOTE(review): address and server are not defined in this function - confirm they exist in an enclosing scope
+
+
+
 def lemmy_site_data():
    site = g.site
    data = {