pwnfoo · Double-A-92 · Oct 15, 2017 · Oct 15, 2017 · pwnfoo · Oct 15, 2017
diff --git a/fakemenot/__init__.py b/fakemenot/__init__.py
@@ -1,194 +0,0 @@
-'''
-This program uses pytesseract to do OCR on images.
-Make sure you have the following packages installed on the system :
-
-* tesseract
-* tesseract-data-eng
-
-Tested on Arch Linux - Rolling
-'''
-
-import argparse
-import os
-import sys
-import pytesseract
-import configparser
-from TwitterSearch import *
-from PIL import Image, ImageEnhance, ImageFilter
-from termcolor import colored
-
-
-parser = argparse.ArgumentParser(description='Process images')
-parser.add_argument(
-    '--image',
-    '-i',
-    help='Twitter screenshot image',
-    required=True)
-parser.add_argument('--limit', '-l', help='Limit tweets pulled', default=250)
-parser.add_argument(
-    '--config',
-    '-c',
-    help='Path to twitter config (default: ~/.fakemenot.config)',
-    default="~/.fakemenot.config")
-
-args = parser.parse_args()
-
-
-def get_config():
-    config = configparser.RawConfigParser()
-    try:
-        with open(os.path.expanduser(args.config)) as config_file:
-            config.readfp(config_file)
-    except IOError as ioe:
-        print(colored("Couldn't open the config file {} because {}".format(
-            args.config, ioe), 'red'))
-        sys.exit(2)
-    return config
-
-
-def _do_ocr_and_lookup(img_obj):
-    config = get_config()
-    limit_of_tweets = int(args.limit)
-    potential_user = '__fakemenot__'
-    # Replace line breaks with a space and split text into an array
-    text = pytesseract.image_to_string(
-        img_obj, lang='eng').replace(
-        '\n', ' ').split(' ')
-    for element in text:
-        if element and element[0] == '@':
-            print("Detected handle : " + str(element))
-            # Since handles cannot have spaces, strip until space
-            potential_user = element.split(' ')[0]
-            break
-
-    # Just in case the person Yousing the program puts in ' or " in the config.
-    consumer_key = config.get(
-        'twitter',
-        'consumer_key').replace(
-            '\'','').replace(
-                '\"','')
-    consumer_secret = config.get(
-        'twitter',
-        'consumer_secret').replace(
-            '\'','').replace(
-                '\"','')
-    access_token = config.get(
-        'twitter',
-        'access_token').replace(
-            '\'','').replace(
-                '\"','')
-    access_token_secret = config.get(
-        'twitter',
-        'access_token_secret').replace(
-            '\'','').replace(
-                '\"','')
-
-    if potential_user == '__fakemenot__':
-        print(colored("[*] It looks like OCR failed. Please make sure you " +
-            "crop the image as in sample and is readable.", 'red'))
-        exit(1)
-
-    try:
-        tuo = TwitterUserOrder(potential_user[1:])
-        ts = TwitterSearch(
-            consumer_key=consumer_key,
-            consumer_secret=consumer_secret,
-            access_token=access_token,
-            access_token_secret=access_token_secret
-        )
-        tweets = []
-        body = '__awesomebody__'
-        for tweet in ts.search_tweets_iterable(tuo):
-            # Nobody cares about re-tweets
-            if 'RT ' not in tweet['text']:
-                if tweet not in tweets:
-                    tweets.append((tweet['text'], tweet['id']))
-                if not limit_of_tweets:
-                    break
-                else:
-                    limit_of_tweets -= 1
-
-        # The most probable tweet body is this.
-        try:
-            body = text[text.index('V') + 1:]
-        except ValueError:
-            body = text
-
-        # If none of that was found, let's report an OCR error
-        if body == '__awesomebody__':
-            print(colored("[*] It looks like OCR failed.Please make sure you " +
-                          "crop image as in sample and is readable.", 'red'))
-
-        found_tweet = False
-        # Check against every tweet pulled
-        for tweet in tweets:
-            removed_elements = 0
-            ltweet, orig_len = tweet[0].split(' '), len(tweet[0].split(' '))
-            # Compare each element of body to element in body. TODO: Optimize
-            for ele in body:
-                if ele in ltweet:
-                    removed_elements += 1
-                    ltweet.remove(ele)
-            removal_rate = (removed_elements / float(orig_len)) * 100
-
-            if int(removal_rate) > 75:
-                found_tweet = True
-                print(colored("[*] It looks like this is a valid tweet",
-                              'green'))
-                print(colored("-> Confidence : " + "%.2f" % removal_rate + "%",
-                              'green'))
-                print(colored("-> Potential URL : https://twitter.com/" +
-                              potential_user[1:] +
-                              "/status/" + str(tweet[1]), 'green'))
-
-            elif int(removal_rate) in (55, 75):
-                found_tweet = True
-                print(colored("[*] This might be a valid tweet", 'yellow'))
-                print(colored("-> Confidence : " + "%.2f" % removal_rate + "%",
-                              'yellow'))
-                print(colored("-> Potential URL : https://twitter.com/" +
-                              potential_user[1:] +
-                              "/status/" + str(tweet[1]), 'yellow'))
-
-        if not found_tweet:
-            print(colored("[*] I couldn't find a tweet like that. " +
-                          "Try increasing the limit to pull more tweets",
-                          'yellow'))
-
-    except TwitterSearchException as e:  # catch all those ugly errors
-        print(e)
-
-
-def _blow_up_image():
-    try:
-        img = Image.open(args.image)
-    except (OSError, IOError):
-        print(colored("[!] I couldn't find a file by that name. Fake you!",
-                      'red'))
-        return False
-
-    basewidth = 2500
-    img = Image.open(args.image)
-    wpercent = (basewidth / float(img.size[0]))
-    hsize = int((float(img.size[1]) * float(wpercent)))
-    # Resize happens here
-    img = img.resize((basewidth, hsize), Image.ANTIALIAS)
-
-    # Thanks Stack Overflow <3 : https://stackoverflow.com/a/37750605/5486120
-    img = img.filter(ImageFilter.MedianFilter())
-    enhancer = ImageEnhance.Contrast(img)
-    img = enhancer.enhance(2)
-
-    # Return the sexy image object
-    return img
-
-
-def main():
-    img_obj = _blow_up_image()
-    if img_obj:
-        # Give that sexy image object to OCR to find potential user
-        _do_ocr_and_lookup(img_obj)
-
-
-if __name__ == '__main__':
-    main()

diff --git a/fakemenot/main.py b/fakemenot/main.py
@@ -0,0 +1,139 @@
+import argparse
+import os
+import sys
+import configparser
+from TwitterSearch import *
+from termcolor import colored
+
+from fakemenot.ocr import find_user_and_text_in_tweet_image
+
+# Command-line interface.  Arguments are parsed at import time and the
+# resulting module-level ``args`` is read by the functions below.
+parser = argparse.ArgumentParser(description='Process images')
+parser.add_argument(
+    '--image',
+    '-i',
+    help='Twitter screenshot image',
+    required=True)
+# type=int lets argparse reject a non-numeric limit with a usage message
+# instead of failing later with a ValueError traceback.
+parser.add_argument(
+    '--limit',
+    '-l',
+    help='Limit tweets pulled',
+    type=int,
+    default=250)
+parser.add_argument(
+    '--config',
+    '-c',
+    help='Path to twitter config (default: ~/.fakemenot.config)',
+    default="~/.fakemenot.config")
+
+args = parser.parse_args()
+
+
+def get_config():
+    """Load the configuration file named by the ``--config`` argument.
+
+    ``~`` in the path is expanded to the user's home directory.
+
+    Returns:
+        configparser.RawConfigParser: parser populated from the file.
+
+    If the file cannot be opened, an error is printed in red and the
+    process exits with status 2.
+    """
+    config = configparser.RawConfigParser()
+    try:
+        with open(os.path.expanduser(args.config)) as config_file:
+            # read_file() is the replacement for readfp(), which was
+            # deprecated in Python 3.2 and removed in Python 3.12.
+            config.read_file(config_file)
+    except IOError as ioe:
+        print(colored("Couldn't open the config file {} because {}".format(
+            args.config, ioe), 'red'))
+        sys.exit(2)
+    return config
+
+
+def _clean_credential(config, option):
+    """Return ``option`` from the ``[twitter]`` section without quotes."""
+    # Just in case the person using the program puts in ' or " in the config.
+    return config.get('twitter', option).replace('\'', '').replace('\"', '')
+
+
+def _match_confidence(body_words, tweet_text):
+    """Return the percentage of the tweet's words found in ``body_words``.
+
+    Each tweet word can be matched at most once, so repeated OCR words do
+    not inflate the score.
+    """
+    remaining = tweet_text.split()
+    total = len(remaining)
+    if not total:
+        # Empty / whitespace-only tweet: nothing to match, avoid dividing by 0.
+        return 0.0
+    matched = 0
+    for word in body_words:
+        if word in remaining:
+            matched += 1
+            remaining.remove(word)
+    return matched / float(total) * 100
+
+
+def _report_match(headline, confidence, handle, tweet_id, colour):
+    """Print the headline, confidence and tweet URL of a match in colour."""
+    print(colored(headline, colour))
+    print(colored("-> Confidence : " + "%.2f" % confidence + "%", colour))
+    print(colored("-> Potential URL : https://twitter.com/" +
+                  handle +
+                  "/status/" + str(tweet_id), colour))
+
+
+def _do_lookup(potential_user, body):
+    """Search a user's recent tweets for one matching the OCR'd text.
+
+    Args:
+        potential_user: handle found by OCR including the leading ``@``,
+            or None when OCR found no handle.
+        body: tweet text found by OCR, either as one string or as a list
+            of words, or None when OCR found no text.
+
+    Prints the result to stdout.  Exits with status 1 when either OCR
+    value is missing.  TwitterSearchException errors are printed, not
+    raised.
+    """
+    config = get_config()
+    limit_of_tweets = int(args.limit)
+
+    consumer_key = _clean_credential(config, 'consumer_key')
+    consumer_secret = _clean_credential(config, 'consumer_secret')
+    access_token = _clean_credential(config, 'access_token')
+    access_token_secret = _clean_credential(config, 'access_token_secret')
+
+    # Without a handle and a body there is nothing to compare, so stop
+    # before any request is sent instead of failing later on a None body.
+    if not potential_user or not body:
+        print(colored("[*] It looks like OCR failed. Please make sure you " +
+                      "crop the image as in sample and is readable.", 'red'))
+        sys.exit(1)
+
+    # The OCR module returns the body as a single string; compare word by
+    # word rather than character by character.
+    body_words = body.split() if isinstance(body, str) else list(body)
+    handle = potential_user[1:]  # drop the leading '@'
+
+    try:
+        tuo = TwitterUserOrder(handle)
+        ts = TwitterSearch(
+            consumer_key=consumer_key,
+            consumer_secret=consumer_secret,
+            access_token=access_token,
+            access_token_secret=access_token_secret
+        )
+        tweets = []
+        seen_ids = set()
+        for tweet in ts.search_tweets_iterable(tuo):
+            # Nobody cares about re-tweets (their text starts with "RT ").
+            if tweet['text'].startswith('RT '):
+                continue
+            if tweet['id'] in seen_ids:
+                continue
+            seen_ids.add(tweet['id'])
+            tweets.append((tweet['text'], tweet['id']))
+            # Checked after appending so that no extra tweet is fetched once
+            # the limit is reached (at least one tweet is always kept).
+            if len(tweets) >= limit_of_tweets:
+                break
+
+        found_tweet = False
+        # Check against every tweet pulled
+        for tweet_text, tweet_id in tweets:
+            confidence = _match_confidence(body_words, tweet_text)
+
+            if int(confidence) > 75:
+                found_tweet = True
+                _report_match("[*] It looks like this is a valid tweet",
+                              confidence, handle, tweet_id, 'green')
+            elif int(confidence) >= 55:
+                # 55-75 % inclusive: plausible, but not a confident match.
+                found_tweet = True
+                _report_match("[*] This might be a valid tweet",
+                              confidence, handle, tweet_id, 'yellow')
+
+        if not found_tweet:
+            print(colored("[*] I couldn't find a tweet like that. " +
+                          "Try increasing the limit to pull more tweets",
+                          'yellow'))
+
+    except TwitterSearchException as e:  # catch all those ugly errors
+        print(e)
+
+
+def main():
+    """Extract handle and text from the ``--image`` file and look them up."""
+    handle, body = find_user_and_text_in_tweet_image(args.image)
+    _do_lookup(handle, body)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fakemenot/ocr.py b/fakemenot/ocr.py
@@ -0,0 +1,59 @@
+"""
+This requires the tesseract library (with the English training data) to do OCR on images.
+See https://github.com/tesseract-ocr/tesseract/wiki and https://pypi.python.org/pypi/pytesseract
+for installation instructions.
+"""
+
+from PIL import Image, ImageEnhance, ImageFilter
+from pytesseract import image_to_string
+
+
+def find_user_and_text_in_tweet_image(image_path):
+    """Run OCR on a tweet screenshot and return ``(user, body)``.
+
+    Returns ``(None, None)`` when the image could not be prepared.
+    Otherwise the recognised text is split into words and handed to
+    extract_values_from_desktop_tweet(); only the desktop detail view of
+    a tweet is supported for now.
+    """
+    prepared = prepare_image_for_ocr(image_path)
+    if not prepared:
+        return None, None
+
+    # Treat line breaks like spaces, then cut the text into single words.
+    recognised = image_to_string(prepared, lang='eng')
+    tokens = ' '.join(recognised.split('\n')).split(' ')
+
+    return extract_values_from_desktop_tweet(tokens)
+
+
+def extract_values_from_desktop_tweet(words):
+    """Pick the user handle and the tweet body out of OCR'd words.
+
+    The handle is the first word that starts with ``@`` and is longer
+    than one character.  The body is every word from the third position
+    after the handle onwards, joined by spaces and stripped.
+
+    Returns:
+        tuple: ``(user, body)``; ``user`` is None when no handle was
+        found, ``body`` is None when no handle was found or no words
+        remain at the body position.
+    """
+    handle_pos = None
+    for pos, token in enumerate(words):
+        if len(token) > 1 and token.startswith('@'):
+            handle_pos = pos
+            break
+
+    if handle_pos is None:
+        return None, None
+
+    # Usually there are 2 random chars after the user handle, then the
+    # body starts.
+    tail = words[handle_pos + 3:]
+    body = " ".join(tail).strip() if tail else None
+
+    return words[handle_pos], body
+
+
+def prepare_image_for_ocr(image_path):
+    """Open an image and enlarge, denoise and sharpen it for OCR.
+
+    Args:
+        image_path: path of the image file to open.
+
+    Returns:
+        The processed PIL image, or None when the file cannot be opened.
+    """
+    # Open the image
+    try:
+        image = Image.open(image_path)
+    except (OSError, IOError):
+        return None
+
+    # Resize to a fixed width, keeping the aspect ratio (never below 1px
+    # high, which resize() would reject for extremely wide images).
+    width = 4096.0
+    height = max(1, int(width / image.size[0] * image.size[1]))
+    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
+    image = image.resize((int(width), height), Image.LANCZOS)
+
+    # Clean image and increase contrast. See: https://stackoverflow.com/a/37750605/5486120
+    image = image.filter(ImageFilter.MedianFilter())
+    enhancer = ImageEnhance.Contrast(image)
+    image = enhancer.enhance(2)
+
+    return image
diff --git a/fakemenot/tests/res/test_ocr_1.png b/fakemenot/tests/res/test_ocr_1.png
diff --git a/fakemenot/tests/res/test_ocr_2.png b/fakemenot/tests/res/test_ocr_2.png
diff --git a/fakemenot/tests/res/test_ocr_3.png b/fakemenot/tests/res/test_ocr_3.png