diff --git a/fakemenot/__init__.py b/fakemenot/__init__.py index f56f179..e69de29 100644 --- a/fakemenot/__init__.py +++ b/fakemenot/__init__.py @@ -1,194 +0,0 @@ -''' -This program uses pytesseract to do OCR on images. -Make sure you have the following packages installed on the system : - -* tesseract -* tesseract-data-eng - -Tested on Arch Linux - Rolling -''' - -import argparse -import os -import sys -import pytesseract -import configparser -from TwitterSearch import * -from PIL import Image, ImageEnhance, ImageFilter -from termcolor import colored - - -parser = argparse.ArgumentParser(description='Process images') -parser.add_argument( - '--image', - '-i', - help='Twitter screenshot image', - required=True) -parser.add_argument('--limit', '-l', help='Limit tweets pulled', default=250) -parser.add_argument( - '--config', - '-c', - help='Path to twitter config (default: ~/.fakemenot.config)', - default="~/.fakemenot.config") - -args = parser.parse_args() - - -def get_config(): - config = configparser.RawConfigParser() - try: - with open(os.path.expanduser(args.config)) as config_file: - config.readfp(config_file) - except IOError as ioe: - print(colored("Couldn't open the config file {} because {}".format( - args.config, ioe), 'red')) - sys.exit(2) - return config - - -def _do_ocr_and_lookup(img_obj): - config = get_config() - limit_of_tweets = int(args.limit) - potential_user = '__fakemenot__' - # Replace line breaks with a space and split text into an array - text = pytesseract.image_to_string( - img_obj, lang='eng').replace( - '\n', ' ').split(' ') - for element in text: - if element and element[0] == '@': - print("Detected handle : " + str(element)) - # Since handles cannot have spaces, strip until space - potential_user = element.split(' ')[0] - break - - # Just in case the person Yousing the program puts in ' or " in the config. 
- consumer_key = config.get( - 'twitter', - 'consumer_key').replace( - '\'','').replace( - '\"','') - consumer_secret = config.get( - 'twitter', - 'consumer_secret').replace( - '\'','').replace( - '\"','') - access_token = config.get( - 'twitter', - 'access_token').replace( - '\'','').replace( - '\"','') - access_token_secret = config.get( - 'twitter', - 'access_token_secret').replace( - '\'','').replace( - '\"','') - - if potential_user == '__fakemenot__': - print(colored("[*] It looks like OCR failed. Please make sure you " + - "crop the image as in sample and is readable.", 'red')) - exit(1) - - try: - tuo = TwitterUserOrder(potential_user[1:]) - ts = TwitterSearch( - consumer_key=consumer_key, - consumer_secret=consumer_secret, - access_token=access_token, - access_token_secret=access_token_secret - ) - tweets = [] - body = '__awesomebody__' - for tweet in ts.search_tweets_iterable(tuo): - # Nobody cares about re-tweets - if 'RT ' not in tweet['text']: - if tweet not in tweets: - tweets.append((tweet['text'], tweet['id'])) - if not limit_of_tweets: - break - else: - limit_of_tweets -= 1 - - # The most probable tweet body is this. - try: - body = text[text.index('V') + 1:] - except ValueError: - body = text - - # If none of that was found, let's report an OCR error - if body == '__awesomebody__': - print(colored("[*] It looks like OCR failed.Please make sure you " + - "crop image as in sample and is readable.", 'red')) - - found_tweet = False - # Check against every tweet pulled - for tweet in tweets: - removed_elements = 0 - ltweet, orig_len = tweet[0].split(' '), len(tweet[0].split(' ')) - # Compare each element of body to element in body. 
TODO: Optimize - for ele in body: - if ele in ltweet: - removed_elements += 1 - ltweet.remove(ele) - removal_rate = (removed_elements / float(orig_len)) * 100 - - if int(removal_rate) > 75: - found_tweet = True - print(colored("[*] It looks like this is a valid tweet", - 'green')) - print(colored("-> Confidence : " + "%.2f" % removal_rate + "%", - 'green')) - print(colored("-> Potential URL : https://twitter.com/" + - potential_user[1:] + - "/status/" + str(tweet[1]), 'green')) - - elif int(removal_rate) in (55, 75): - found_tweet = True - print(colored("[*] This might be a valid tweet", 'yellow')) - print(colored("-> Confidence : " + "%.2f" % removal_rate + "%", - 'yellow')) - print(colored("-> Potential URL : https://twitter.com/" + - potential_user[1:] + - "/status/" + str(tweet[1]), 'yellow')) - - if not found_tweet: - print(colored("[*] I couldn't find a tweet like that. " + - "Try increasing the limit to pull more tweets", - 'yellow')) - - except TwitterSearchException as e: # catch all those ugly errors - print(e) - - -def _blow_up_image(): - try: - img = Image.open(args.image) - except (OSError, IOError): - print(colored("[!] I couldn't find a file by that name. 
Fake you!", - 'red')) - return False - - basewidth = 2500 - img = Image.open(args.image) - wpercent = (basewidth / float(img.size[0])) - hsize = int((float(img.size[1]) * float(wpercent))) - # Resize happens here - img = img.resize((basewidth, hsize), Image.ANTIALIAS) - - # Thanks Stack Overflow <3 : https://stackoverflow.com/a/37750605/5486120 - img = img.filter(ImageFilter.MedianFilter()) - enhancer = ImageEnhance.Contrast(img) - img = enhancer.enhance(2) - - # Return the sexy image object - return img - - -def main(): - img_obj = _blow_up_image() - if img_obj: - # Give that sexy image object to OCR to find potential user - _do_ocr_and_lookup(img_obj) - - -if __name__ == '__main__': - main() diff --git a/fakemenot/main.py b/fakemenot/main.py new file mode 100644 index 0000000..4c78415 --- /dev/null +++ b/fakemenot/main.py @@ -0,0 +1,139 @@ +import argparse +import os +import sys +import configparser +from TwitterSearch import * +from termcolor import colored + +from fakemenot.ocr import find_user_and_text_in_tweet_image + +parser = argparse.ArgumentParser(description='Process images') +parser.add_argument( + '--image', + '-i', + help='Twitter screenshot image', + required=True) +parser.add_argument('--limit', '-l', help='Limit tweets pulled', default=250) +parser.add_argument( + '--config', + '-c', + help='Path to twitter config (default: ~/.fakemenot.config)', + default="~/.fakemenot.config") + +args = parser.parse_args() + + +def get_config(): + config = configparser.RawConfigParser() + try: + with open(os.path.expanduser(args.config)) as config_file: + config.readfp(config_file) + except IOError as ioe: + print(colored("Couldn't open the config file {} because {}".format( + args.config, ioe), 'red')) + sys.exit(2) + return config + + +def _do_lookup(potential_user, body): + config = get_config() + limit_of_tweets = int(args.limit) + + # Just in case the person using the program puts in ' or " in the config. 
+ consumer_key = config.get( + 'twitter', + 'consumer_key').replace( + '\'', '').replace( + '\"', '') + consumer_secret = config.get( + 'twitter', + 'consumer_secret').replace( + '\'', '').replace( + '\"', '') + access_token = config.get( + 'twitter', + 'access_token').replace( + '\'', '').replace( + '\"', '') + access_token_secret = config.get( + 'twitter', + 'access_token_secret').replace( + '\'', '').replace( + '\"', '') + + if potential_user is None: + print(colored("[*] It looks like OCR failed. Please make sure you " + + "crop the image as in sample and is readable.", 'red')) + exit(1) + + try: + tuo = TwitterUserOrder(potential_user[1:]) + ts = TwitterSearch( + consumer_key=consumer_key, + consumer_secret=consumer_secret, + access_token=access_token, + access_token_secret=access_token_secret + ) + tweets = [] + for tweet in ts.search_tweets_iterable(tuo): + # Nobody cares about re-tweets + if 'RT ' not in tweet['text']: + if tweet not in tweets: + tweets.append((tweet['text'], tweet['id'])) + if not limit_of_tweets: + break + else: + limit_of_tweets -= 1 + + # If none of that was found, let's report an OCR error + if body is None: + print(colored("[*] It looks like OCR failed.Please make sure you " + + "crop image as in sample and is readable.", 'red')) + + found_tweet = False + # Check against every tweet pulled + for tweet in tweets: + removed_elements = 0 + ltweet, orig_len = tweet[0].split(' '), len(tweet[0].split(' ')) + # Compare each element of body to the words of the tweet. 
TODO: Optimize + for ele in body: + if ele in ltweet: + removed_elements += 1 + ltweet.remove(ele) + removal_rate = (removed_elements / float(orig_len)) * 100 + + if int(removal_rate) > 75: + found_tweet = True + print(colored("[*] It looks like this is a valid tweet", + 'green')) + print(colored("-> Confidence : " + "%.2f" % removal_rate + "%", + 'green')) + print(colored("-> Potential URL : https://twitter.com/" + + potential_user[1:] + + "/status/" + str(tweet[1]), 'green')) + + elif int(removal_rate) in (55, 75): + found_tweet = True + print(colored("[*] This might be a valid tweet", 'yellow')) + print(colored("-> Confidence : " + "%.2f" % removal_rate + "%", + 'yellow')) + print(colored("-> Potential URL : https://twitter.com/" + + potential_user[1:] + + "/status/" + str(tweet[1]), 'yellow')) + + if not found_tweet: + print(colored("[*] I couldn't find a tweet like that. " + + "Try increasing the limit to pull more tweets", + 'yellow')) + + except TwitterSearchException as e: # catch all those ugly errors + print(e) + + +def main(): + user, tweet_text = find_user_and_text_in_tweet_image(args.image) + _do_lookup(user, tweet_text) + + +if __name__ == '__main__': + main() diff --git a/fakemenot/ocr.py b/fakemenot/ocr.py new file mode 100644 index 0000000..6477e71 --- /dev/null +++ b/fakemenot/ocr.py @@ -0,0 +1,59 @@ +""" +This requires the tesseract library (with the English training data) to do OCR on images. +See https://github.com/tesseract-ocr/tesseract/wiki and https://pypi.python.org/pypi/pytesseract +for installation instructions. 
+""" + +from PIL import Image, ImageEnhance, ImageFilter +from pytesseract import image_to_string + + +def find_user_and_text_in_tweet_image(image_path): + # OCR image + image = prepare_image_for_ocr(image_path) + if not image: + return None, None + + # Split result into single words + text = image_to_string(image, lang='eng') + words = text.replace('\n', ' ').split(' ') + + # Delegate extraction of name and body to separate function. + # Only the desktop detail view of a tweet is supported for now. + return extract_values_from_desktop_tweet(words) + + +def extract_values_from_desktop_tweet(words): + # Find user handle in words + user = next((w for w in words if (w and len(w) > 1 and w[0] == '@')), None) + + # Find the tweet text body + body = None + if user: + # Usually there are 2 random chars after the user handle, then the body starts. + body_index = words.index(user) + 3 + + if len(words) > body_index: + body = " ".join(words[body_index:]).strip() + + return user, body + + +def prepare_image_for_ocr(image_path): + # Open the image + try: + image = Image.open(image_path) + except (OSError, IOError): + return None + + # Resize image + width = 4096.0 + height = width / image.size[0] * image.size[1] + image = image.resize((int(width), int(height)), Image.ANTIALIAS) + + # Clean image and increase contrast. 
See: https://stackoverflow.com/a/37750605/5486120 + image = image.filter(ImageFilter.MedianFilter()) + enhancer = ImageEnhance.Contrast(image) + image = enhancer.enhance(2) + + return image diff --git a/fakemenot/tests/res/test_ocr_1.png b/fakemenot/tests/res/test_ocr_1.png new file mode 100644 index 0000000..389aa6b Binary files /dev/null and b/fakemenot/tests/res/test_ocr_1.png differ diff --git a/fakemenot/tests/res/test_ocr_2.png b/fakemenot/tests/res/test_ocr_2.png new file mode 100644 index 0000000..a117c70 Binary files /dev/null and b/fakemenot/tests/res/test_ocr_2.png differ diff --git a/fakemenot/tests/res/test_ocr_3.png b/fakemenot/tests/res/test_ocr_3.png new file mode 100644 index 0000000..a040d8f Binary files /dev/null and b/fakemenot/tests/res/test_ocr_3.png differ diff --git a/fakemenot/tests/test_ocr.py b/fakemenot/tests/test_ocr.py new file mode 100644 index 0000000..ed5fd3b --- /dev/null +++ b/fakemenot/tests/test_ocr.py @@ -0,0 +1,32 @@ +import unittest +from difflib import SequenceMatcher +from fakemenot.ocr import find_user_and_text_in_tweet_image + + +class OcrTestCase(unittest.TestCase): + def common_test_ocr_tweet(self, image_path, expected_user, expected_text): + user, text = find_user_and_text_in_tweet_image(image_path) + self.assertEqual(expected_user, user) + self.assertTrue(SequenceMatcher(a=expected_text, b=text).ratio() > 0.95) + + def test_ocr_tweet_1(self): + expected_user = "@mattdm" + expected_text = "Got bored so updated my #fedora 25 laptop to pre-alpha 26. Started, went for lunch, came " \ + "back to a system which Just Works -- no fuss!" + self.common_test_ocr_tweet("res/test_ocr_1.png", expected_user, expected_text) + + def test_ocr_tweet_2(self): + expected_user = "@NASA" + expected_text = "For 70 years, planes loudly flew supersonic & barriers were broken. 
Now we're making " \ + "history again in a quiet way: go.nasa.gov/2kOO1cc" + self.common_test_ocr_tweet("res/test_ocr_2.png", expected_user, expected_text) + + def test_ocr_tweet_3(self): + expected_user = "@wikileaks" + expected_text = "Harvard made Harry Belafonte & Ira Berlin share its highest honor in African American " \ + "Studies with Harvey Weinstein" + self.common_test_ocr_tweet("res/test_ocr_3.png", expected_user, expected_text) + + +if __name__ == '__main__': + unittest.main() diff --git a/setup.py b/setup.py index c19987a..da3a7e8 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ 'pytesseract', 'twittersearch', 'termcolor', + 'Pillow', ] here = os.path.abspath(os.path.dirname(__file__)) @@ -34,7 +35,7 @@ description=DESCRIPTION, entry_points={ 'console_scripts': [ - 'fakemenot = fakemenot.__init__:main', + 'fakemenot = fakemenot.main:main', ], }, long_description=long_description,