-
Notifications
You must be signed in to change notification settings - Fork 14
Extract OCR functions and add unit tests for them. #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Double-A-92
wants to merge
2
commits into
pwnfoo:master
Choose a base branch
from
Double-A-92:master
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,194 +0,0 @@ | ||
| ''' | ||
| This program uses pytesseract to do OCR on images. | ||
| Make sure you have the following packages installed on the system : | ||
|
|
||
| * tesseract | ||
| * tesseract-data-eng | ||
|
|
||
| Tested on Arch Linux - Rolling | ||
| ''' | ||
|
|
||
| import argparse | ||
| import os | ||
| import sys | ||
| import pytesseract | ||
| import configparser | ||
| from TwitterSearch import * | ||
| from PIL import Image, ImageEnhance, ImageFilter | ||
| from termcolor import colored | ||
|
|
||
|
|
||
# Command-line interface. Arguments are parsed at import time because
# get_config() and the other functions in this module read the module-level
# ``args`` object.
parser = argparse.ArgumentParser(description='Process images')
parser.add_argument(
    '--image',
    '-i',
    help='Twitter screenshot image',
    required=True)
# type=int lets argparse reject a non-numeric limit with a usage error
# instead of a ValueError traceback when the value is converted later.
parser.add_argument(
    '--limit',
    '-l',
    type=int,
    help='Limit tweets pulled',
    default=250)
parser.add_argument(
    '--config',
    '-c',
    help='Path to twitter config (default: ~/.fakemenot.config)',
    default="~/.fakemenot.config")

args = parser.parse_args()
|
|
||
|
|
||
def get_config():
    """Load the Twitter configuration file named by ``args.config``.

    ``~`` in the path is expanded to the user's home directory.

    Returns:
        configparser.RawConfigParser: parser populated from the file.

    Exits with status 2 (after printing a red error message) when the file
    cannot be opened.
    """
    config = configparser.RawConfigParser()
    try:
        with open(os.path.expanduser(args.config)) as config_file:
            # read_file() replaces readfp(), which was removed in Python 3.12.
            config.read_file(config_file)
    except IOError as ioe:
        print(colored("Couldn't open the config file {} because {}".format(
            args.config, ioe), 'red'))
        sys.exit(2)
    return config
|
|
||
|
|
||
def _do_ocr_and_lookup(img_obj):
    """Run OCR on a prepared image and look for a matching tweet.

    The OCR text is split into words and the first word that starts with
    ``@`` (and has at least one more character) is taken as the author's
    handle. That user's tweets are fetched through TwitterSearch and each one
    is scored by the percentage of its words that also occur in the OCR'd
    body. The verdict for every sufficiently similar tweet is printed.

    Args:
        img_obj: image object passed to ``pytesseract.image_to_string``.

    Exits with status 1 when no handle is found in the OCR output.
    TwitterSearch errors are printed rather than raised.
    """
    config = get_config()
    limit_of_tweets = int(args.limit)

    # Replace line breaks with a space and split text into an array
    text = pytesseract.image_to_string(
        img_obj, lang='eng').replace('\n', ' ').split(' ')

    # Handles cannot contain spaces, so the first '@...' word is the handle.
    # A bare '@' is skipped: it would yield an empty user name below.
    potential_user = next(
        (word for word in text if len(word) > 1 and word[0] == '@'), None)
    if potential_user is not None:
        print("Detected handle : " + str(potential_user))

    # Just in case the person using the program puts in ' or " in the config.
    credentials = {
        key: config.get('twitter', key).replace('\'', '').replace('\"', '')
        for key in ('consumer_key', 'consumer_secret',
                    'access_token', 'access_token_secret')
    }

    if potential_user is None:
        print(colored("[*] It looks like OCR failed. Please make sure you " +
                      "crop the image as in sample and is readable.", 'red'))
        sys.exit(1)

    # The most probable tweet body is whatever follows a lone 'V' word
    # (presumably how OCR renders an icon in the tweet header; verify against
    # the sample images). Without it, the whole OCR text is used.
    try:
        body = text[text.index('V') + 1:]
    except ValueError:
        body = text

    try:
        tuo = TwitterUserOrder(potential_user[1:])
        ts = TwitterSearch(**credentials)
        tweets = []
        for tweet in ts.search_tweets_iterable(tuo):
            # Nobody cares about re-tweets
            if 'RT ' in tweet['text']:
                continue
            entry = (tweet['text'], tweet['id'])
            # Compare the stored tuple, not the raw tweet dict, so that
            # duplicates are actually dropped.
            if entry not in tweets:
                tweets.append(entry)
            # Stop at exactly the requested number of tweets.
            if len(tweets) >= limit_of_tweets:
                break

        found_tweet = False
        # Check against every tweet pulled
        for tweet_text, tweet_id in tweets:
            remaining = tweet_text.split(' ')
            orig_len = len(remaining)  # >= 1: split(' ') never returns []
            removed_elements = 0
            # Each body word may consume at most one matching tweet word.
            for ele in body:
                if ele in remaining:
                    removed_elements += 1
                    remaining.remove(ele)
            removal_rate = (removed_elements / float(orig_len)) * 100

            confidence = int(removal_rate)
            if confidence > 75:
                headline, colour = ("[*] It looks like this is a valid tweet",
                                    'green')
            elif confidence >= 55:
                # Whole 55..75 band, not only the two end points.
                headline, colour = "[*] This might be a valid tweet", 'yellow'
            else:
                continue

            found_tweet = True
            print(colored(headline, colour))
            print(colored("-> Confidence : " + "%.2f" % removal_rate + "%",
                          colour))
            print(colored("-> Potential URL : https://twitter.com/" +
                          potential_user[1:] +
                          "/status/" + str(tweet_id), colour))

        if not found_tweet:
            print(colored("[*] I couldn't find a tweet like that. " +
                          "Try increasing the limit to pull more tweets",
                          'yellow'))

    except TwitterSearchException as e:  # catch all those ugly errors
        print(e)
|
|
||
|
|
||
def _blow_up_image():
    """Open ``args.image`` and enlarge and clean it up for OCR.

    The image is scaled to a width of 2500 pixels (keeping the aspect ratio),
    median-filtered and its contrast is doubled.

    Returns:
        The processed PIL image, or ``False`` (after printing a red error
        message) when the file cannot be opened.
    """
    try:
        source = Image.open(args.image)
    except (OSError, IOError):
        print(colored("[!] I couldn't find a file by that name. Fake you!",
                      'red'))
        return False

    basewidth = 2500
    # Open the file only once and close it as soon as the resized copy exists.
    with source:
        wpercent = basewidth / float(source.size[0])
        # Never let the scaled height truncate to 0.
        hsize = max(1, int(float(source.size[1]) * wpercent))
        # Resize happens here. Image.LANCZOS is the filter that used to be
        # exposed as Image.ANTIALIAS, which was removed in Pillow 10.
        img = source.resize((basewidth, hsize), Image.LANCZOS)

    # Thanks Stack Overflow <3 : https://stackoverflow.com/a/37750605/5486120
    img = img.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(2)

    # Return the processed image object
    return img
|
|
||
|
|
||
def main():
    """Prepare the screenshot and, if that worked, run OCR and the lookup."""
    prepared = _blow_up_image()
    if not prepared:
        # _blow_up_image() has already reported the problem.
        return
    # Hand the prepared image to OCR to find the potential user
    _do_ocr_and_lookup(prepared)


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,139 @@ | ||
| import argparse | ||
| import os | ||
| import sys | ||
| import configparser | ||
| from TwitterSearch import * | ||
| from termcolor import colored | ||
|
|
||
| from fakemenot.ocr import find_user_and_text_in_tweet_image | ||
|
|
||
# Command-line interface. Arguments are parsed at import time because
# get_config(), _do_lookup() and main() read the module-level ``args`` object.
parser = argparse.ArgumentParser(description='Process images')
parser.add_argument(
    '--image',
    '-i',
    help='Twitter screenshot image',
    required=True)
# type=int lets argparse reject a non-numeric limit with a usage error
# instead of a ValueError traceback when the value is converted later.
parser.add_argument(
    '--limit',
    '-l',
    type=int,
    help='Limit tweets pulled',
    default=250)
parser.add_argument(
    '--config',
    '-c',
    help='Path to twitter config (default: ~/.fakemenot.config)',
    default="~/.fakemenot.config")

args = parser.parse_args()
|
|
||
|
|
||
def get_config():
    """Load the Twitter configuration file named by ``args.config``.

    ``~`` in the path is expanded to the user's home directory.

    Returns:
        configparser.RawConfigParser: parser populated from the file.

    Exits with status 2 (after printing a red error message) when the file
    cannot be opened.
    """
    config = configparser.RawConfigParser()
    try:
        with open(os.path.expanduser(args.config)) as config_file:
            # read_file() replaces readfp(), which was removed in Python 3.12.
            config.read_file(config_file)
    except IOError as ioe:
        print(colored("Couldn't open the config file {} because {}".format(
            args.config, ioe), 'red'))
        sys.exit(2)
    return config
|
|
||
|
|
||
def _word_overlap_rate(body_words, tweet_text):
    """Return the percentage of the tweet's words that occur in body_words."""
    remaining = tweet_text.split(' ')
    total = len(remaining)  # >= 1: str.split(' ') never returns []
    matched = 0
    # Each body word may consume at most one matching tweet word.
    for word in body_words:
        if word in remaining:
            matched += 1
            remaining.remove(word)
    return (matched / float(total)) * 100


def _do_lookup(potential_user, body):
    """Search a user's tweets for one that matches the OCR'd tweet body.

    The user's tweets are fetched through TwitterSearch (re-tweets are
    skipped, at most ``args.limit`` tweets are kept) and each one is scored by
    the percentage of its words that also occur in ``body``. The verdict for
    every sufficiently similar tweet is printed.

    Args:
        potential_user: handle including the leading ``@``, or ``None`` when
            OCR found no handle.
        body: tweet text as a single string (split on single spaces here);
            a sequence of words is accepted as well. ``None`` means OCR found
            no body.

    Exits with status 1 when ``potential_user`` or ``body`` is ``None``.
    TwitterSearch errors are printed rather than raised.
    """
    config = get_config()
    limit_of_tweets = int(args.limit)

    # Just in case the person using the program puts in ' or " in the config.
    credentials = {
        key: config.get('twitter', key).replace('\'', '').replace('\"', '')
        for key in ('consumer_key', 'consumer_secret',
                    'access_token', 'access_token_secret')
    }

    if potential_user is None:
        print(colored("[*] It looks like OCR failed. Please make sure you " +
                      "crop the image as in sample and is readable.", 'red'))
        sys.exit(1)

    # Without a body there is nothing to compare; stop here instead of
    # failing later while iterating over None.
    if body is None:
        print(colored("[*] It looks like OCR failed.Please make sure you " +
                      "crop image as in sample and is readable.", 'red'))
        sys.exit(1)

    # The comparison works on words. Iterating a plain string would compare
    # single characters, so split it first.
    body_words = body.split(' ') if isinstance(body, str) else list(body)

    try:
        tuo = TwitterUserOrder(potential_user[1:])
        ts = TwitterSearch(**credentials)
        tweets = []
        for tweet in ts.search_tweets_iterable(tuo):
            # Nobody cares about re-tweets
            if 'RT ' in tweet['text']:
                continue
            entry = (tweet['text'], tweet['id'])
            # Compare the stored tuple, not the raw tweet dict, so that
            # duplicates are actually dropped.
            if entry not in tweets:
                tweets.append(entry)
            # Stop at exactly the requested number of tweets.
            if len(tweets) >= limit_of_tweets:
                break

        found_tweet = False
        # Check against every tweet pulled
        for tweet_text, tweet_id in tweets:
            removal_rate = _word_overlap_rate(body_words, tweet_text)

            confidence = int(removal_rate)
            if confidence > 75:
                headline, colour = ("[*] It looks like this is a valid tweet",
                                    'green')
            elif confidence >= 55:
                # Whole 55..75 band, not only the two end points.
                headline, colour = "[*] This might be a valid tweet", 'yellow'
            else:
                continue

            found_tweet = True
            print(colored(headline, colour))
            print(colored("-> Confidence : " + "%.2f" % removal_rate + "%",
                          colour))
            print(colored("-> Potential URL : https://twitter.com/" +
                          potential_user[1:] +
                          "/status/" + str(tweet_id), colour))

        if not found_tweet:
            print(colored("[*] I couldn't find a tweet like that. " +
                          "Try increasing the limit to pull more tweets",
                          'yellow'))

    except TwitterSearchException as e:  # catch all those ugly errors
        print(e)
|
|
||
|
|
||
def main():
    """OCR the screenshot given on the command line and look the tweet up."""
    ocr_result = find_user_and_text_in_tweet_image(args.image)
    handle, text = ocr_result
    _do_lookup(handle, text)


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| """ | ||
| This requires the tesseract library (with the english training data) to do OCR on images. | ||
| See https://github.com/tesseract-ocr/tesseract/wiki and https://pypi.python.org/pypi/pytesseract | ||
| for installation instructions. | ||
| """ | ||
|
|
||
| from PIL import Image, ImageEnhance, ImageFilter | ||
| from pytesseract import image_to_string | ||
|
|
||
|
|
||
def find_user_and_text_in_tweet_image(image_path):
    """Run OCR on a tweet screenshot and return ``(user, body)``.

    Returns ``(None, None)`` when ``prepare_image_for_ocr`` yields no image;
    otherwise returns whatever ``extract_values_from_desktop_tweet`` finds in
    the recognised words.
    """
    prepared = prepare_image_for_ocr(image_path)
    if prepared:
        # Flatten the OCR output to one line, then cut it at single spaces.
        ocr_output = image_to_string(prepared, lang='eng')
        flattened = ocr_output.replace('\n', ' ')
        # Only the desktop detail view of a tweet is supported for now.
        return extract_values_from_desktop_tweet(flattened.split(' '))
    return None, None
|
|
||
|
|
||
def extract_values_from_desktop_tweet(words):
    """Pick the user handle and the tweet body out of a list of OCR'd words.

    The handle is the first word that starts with ``@`` and is longer than one
    character. The body is every word from the third position after the
    handle onwards, joined with single spaces and stripped.

    Returns:
        ``(user, body)``; ``user`` is ``None`` when no handle is found, and
        ``body`` is ``None`` when there is no handle or no words remain at the
        expected body position.
    """
    handle = None
    for candidate in words:
        if candidate and len(candidate) > 1 and candidate[0] == '@':
            handle = candidate
            break

    if not handle:
        return handle, None

    # Usually there are 2 random chars after the user handle, then the body
    # starts.
    start = words.index(handle) + 3
    if start >= len(words):
        return handle, None

    return handle, " ".join(words[start:]).strip()
|
|
||
|
|
||
def prepare_image_for_ocr(image_path):
    """Open an image and enlarge and clean it up for OCR.

    The image is scaled to a width of 4096 pixels (keeping the aspect ratio),
    median-filtered and its contrast is doubled.

    Args:
        image_path: path of the image file to open.

    Returns:
        The processed PIL image, or ``None`` when the file cannot be opened.
    """
    # Open the image
    try:
        source = Image.open(image_path)
    except (OSError, IOError):
        return None

    # Resize image; the source file is closed once the resized copy exists.
    width = 4096.0
    with source:
        height = width / source.size[0] * source.size[1]
        # Never let the scaled height truncate to 0.
        target_size = (int(width), max(1, int(height)))
        # Image.LANCZOS is the filter that used to be exposed as
        # Image.ANTIALIAS, which was removed in Pillow 10.
        image = source.resize(target_size, Image.LANCZOS)

    # Clean image and increase contrast.
    # See: https://stackoverflow.com/a/37750605/5486120
    image = image.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    return image
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`body` is being passed as a string here, so every iteration is going to produce a single character. It's best to call
`body.split(' ')` before this :)
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't touch that part. I just literally ripped out the OCR bits, i.e. the parts that set the variables potential_user and body.
That whole analysis part can probably be replaced by the SequenceMatcher.ratio() function that I also used in the unit tests. Seems to do the same thing?