Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 0 additions & 194 deletions fakemenot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,194 +0,0 @@
'''
This program uses pytesseract to do OCR on images.
Make sure you have the following packages installed on the system :

* tesseract
* tesseract-data-eng

Tested on Arch Linux - Rolling
'''

import argparse
import os
import sys
import pytesseract
import configparser
from TwitterSearch import *
from PIL import Image, ImageEnhance, ImageFilter
from termcolor import colored


# Command-line interface.
# NOTE: parse_args() runs at import time; the resulting ``args`` namespace is
# read as a module-level global by the functions defined below.
parser = argparse.ArgumentParser(description='Process images')
parser.add_argument(
    '--image',
    '-i',
    help='Twitter screenshot image',
    required=True)
parser.add_argument(
    '--limit',
    '-l',
    help='Limit tweets pulled',
    # Validate the value here so a non-numeric limit is reported as a usage
    # error instead of surfacing later as a ValueError from int(args.limit).
    type=int,
    default=250)
parser.add_argument(
    '--config',
    '-c',
    help='Path to twitter config (default: ~/.fakemenot.config)',
    default="~/.fakemenot.config")

args = parser.parse_args()


def get_config():
    """Load the configuration file named by the ``--config`` option.

    A leading ``~`` in the path is expanded to the user's home directory.

    Returns:
        The populated ``configparser.RawConfigParser``.

    Raises:
        SystemExit: with status 2 (after printing a red error message) when
            the file cannot be opened.
    """
    config = configparser.RawConfigParser()
    config_path = os.path.expanduser(args.config)
    try:
        with open(config_path) as config_file:
            # read_file() is the replacement for readfp(), which was
            # deprecated in Python 3.2 and removed in Python 3.12.
            config.read_file(config_file)
    except IOError as ioe:
        print(colored("Couldn't open the config file {} because {}".format(
            args.config, ioe), 'red'))
        sys.exit(2)
    return config


def _do_ocr_and_lookup(img_obj):
config = get_config()
limit_of_tweets = int(args.limit)
potential_user = '__fakemenot__'
# Replace line breaks with a space and split text into an array
text = pytesseract.image_to_string(
img_obj, lang='eng').replace(
'\n', ' ').split(' ')
for element in text:
if element and element[0] == '@':
print("Detected handle : " + str(element))
# Since handles cannot have spaces, strip until space
potential_user = element.split(' ')[0]
break

# Just in case the person using the program puts in ' or " in the config.
consumer_key = config.get(
'twitter',
'consumer_key').replace(
'\'','').replace(
'\"','')
consumer_secret = config.get(
'twitter',
'consumer_secret').replace(
'\'','').replace(
'\"','')
access_token = config.get(
'twitter',
'access_token').replace(
'\'','').replace(
'\"','')
access_token_secret = config.get(
'twitter',
'access_token_secret').replace(
'\'','').replace(
'\"','')

if potential_user == '__fakemenot__':
print(colored("[*] It looks like OCR failed. Please make sure you " +
"crop the image as in sample and is readable.", 'red'))
exit(1)

try:
tuo = TwitterUserOrder(potential_user[1:])
ts = TwitterSearch(
consumer_key=consumer_key,
consumer_secret=consumer_secret,
access_token=access_token,
access_token_secret=access_token_secret
)
tweets = []
body = '__awesomebody__'
for tweet in ts.search_tweets_iterable(tuo):
# Nobody cares about re-tweets
if 'RT ' not in tweet['text']:
if tweet not in tweets:
tweets.append((tweet['text'], tweet['id']))
if not limit_of_tweets:
break
else:
limit_of_tweets -= 1

# The most probable tweet body is this.
try:
body = text[text.index('V') + 1:]
except ValueError:
body = text

# If none of that was found, let's report an OCR error
if body == '__awesomebody__':
print(colored("[*] It looks like OCR failed.Please make sure you " +
"crop image as in sample and is readable.", 'red'))

found_tweet = False
# Check against every tweet pulled
for tweet in tweets:
removed_elements = 0
ltweet, orig_len = tweet[0].split(' '), len(tweet[0].split(' '))
# Compare each element of body to element in body. TODO: Optimize
for ele in body:

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Body is being passed as a string here. So, every iteration is going to produce a single character. It's best to body.split(' ') before this :)

@Double-A-92 Double-A-92 Oct 15, 2017

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't touch that part. Just literally ripped out the OCR bits, e.g. the parts that set the variables potential_user and body.

That whole analysis part can probably be replaced by the SequenceMatcher.ratio() function that I also used in the unit tests. Seems to do the same thing?

if ele in ltweet:
removed_elements += 1
ltweet.remove(ele)
removal_rate = (removed_elements / float(orig_len)) * 100

if int(removal_rate) > 75:
found_tweet = True
print(colored("[*] It looks like this is a valid tweet",
'green'))
print(colored("-> Confidence : " + "%.2f" % removal_rate + "%",
'green'))
print(colored("-> Potential URL : https://twitter.com/" +
potential_user[1:] +
"/status/" + str(tweet[1]), 'green'))

elif int(removal_rate) in (55, 75):
found_tweet = True
print(colored("[*] This might be a valid tweet", 'yellow'))
print(colored("-> Confidence : " + "%.2f" % removal_rate + "%",
'yellow'))
print(colored("-> Potential URL : https://twitter.com/" +
potential_user[1:] +
"/status/" + str(tweet[1]), 'yellow'))

if not found_tweet:
print(colored("[*] I couldn't find a tweet like that. " +
"Try increasing the limit to pull more tweets",
'yellow'))

except TwitterSearchException as e: # catch all those ugly errors
print(e)


def _blow_up_image():
    """Open the ``--image`` file and enlarge/clean it for OCR.

    The image is scaled to a width of 2500 pixels (keeping the aspect
    ratio), passed through a median filter and its contrast is doubled.

    Returns:
        The processed PIL image, or ``False`` when the file cannot be opened
        (an error message is printed in that case).
    """
    # Open once; the previous version opened the same file a second time
    # after the try block for no benefit.
    try:
        img = Image.open(args.image)
    except (OSError, IOError):
        print(colored("[!] I couldn't find a file by that name. Fake you!",
                      'red'))
        return False

    basewidth = 2500
    wpercent = basewidth / float(img.size[0])
    # Guard against a height of 0 for extremely wide, short images.
    hsize = max(1, int(float(img.size[1]) * float(wpercent)))
    # Resize happens here. Image.LANCZOS is the same filter as the old
    # Image.ANTIALIAS alias, which was removed in Pillow 10.
    img = img.resize((basewidth, hsize), Image.LANCZOS)

    # Thanks Stack Overflow <3 : https://stackoverflow.com/a/37750605/5486120
    img = img.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(2)

    return img


def main():
    """Prepare the screenshot and, if that succeeded, run OCR and lookup."""
    prepared_image = _blow_up_image()
    if not prepared_image:
        # _blow_up_image() already reported why the image is unusable.
        return
    # Hand the prepared image to OCR to find the potential user.
    _do_ocr_and_lookup(prepared_image)


if __name__ == '__main__':
    main()
139 changes: 139 additions & 0 deletions fakemenot/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import argparse
import os
import sys
import configparser
from TwitterSearch import *
from termcolor import colored

from fakemenot.ocr import find_user_and_text_in_tweet_image

# Command-line interface.
# NOTE: parse_args() runs at import time; the resulting ``args`` namespace is
# read as a module-level global by the functions defined below.
parser = argparse.ArgumentParser(description='Process images')
parser.add_argument(
    '--image',
    '-i',
    help='Twitter screenshot image',
    required=True)
parser.add_argument(
    '--limit',
    '-l',
    help='Limit tweets pulled',
    # Validate the value here so a non-numeric limit is reported as a usage
    # error instead of surfacing later as a ValueError from int(args.limit).
    type=int,
    default=250)
parser.add_argument(
    '--config',
    '-c',
    help='Path to twitter config (default: ~/.fakemenot.config)',
    default="~/.fakemenot.config")

args = parser.parse_args()


def get_config():
    """Load the configuration file named by the ``--config`` option.

    A leading ``~`` in the path is expanded to the user's home directory.

    Returns:
        The populated ``configparser.RawConfigParser``.

    Raises:
        SystemExit: with status 2 (after printing a red error message) when
            the file cannot be opened.
    """
    config = configparser.RawConfigParser()
    config_path = os.path.expanduser(args.config)
    try:
        with open(config_path) as config_file:
            # read_file() is the replacement for readfp(), which was
            # deprecated in Python 3.2 and removed in Python 3.12.
            config.read_file(config_file)
    except IOError as ioe:
        print(colored("Couldn't open the config file {} because {}".format(
            args.config, ioe), 'red'))
        sys.exit(2)
    return config


def _read_credential(config, option):
    """Return ``option`` from the ``twitter`` config section, quotes removed."""
    # Just in case the person using the program puts in ' or " in the config.
    return config.get('twitter', option).replace('\'', '').replace('\"', '')


def _word_overlap_percent(body_words, tweet_text):
    """Return the percentage of words in ``tweet_text`` found in ``body_words``."""
    remaining = tweet_text.split(' ')
    # Never zero: str.split(' ') always yields at least one element.
    total = len(remaining)
    matched = 0
    for word in body_words:
        if word in remaining:
            matched += 1
            # Consume the word so a repeated OCR word can only match a
            # repeated word in the tweet.
            remaining.remove(word)
    return (matched / float(total)) * 100


def _report_match(headline, removal_rate, potential_user, tweet_id, colour):
    """Print the headline, confidence and URL of a matching tweet."""
    print(colored(headline, colour))
    print(colored("-> Confidence : " + "%.2f" % removal_rate + "%", colour))
    print(colored("-> Potential URL : https://twitter.com/" +
                  potential_user[1:] +
                  "/status/" + str(tweet_id), colour))


def _do_lookup(potential_user, body):
    """Search the user's timeline for a tweet matching the OCR'd text.

    Args:
        potential_user: the handle found by OCR, including the leading
            ``@``, or ``None`` when OCR found no handle.
        body: the tweet text found by OCR, either as one space-separated
            string or as a sequence of words, or ``None`` when OCR found
            no text.

    Raises:
        SystemExit: with status 1 when either OCR value is ``None``; with
            status 2 (via ``get_config``) when the config is unreadable.
    """
    config = get_config()
    limit_of_tweets = int(args.limit)

    consumer_key = _read_credential(config, 'consumer_key')
    consumer_secret = _read_credential(config, 'consumer_secret')
    access_token = _read_credential(config, 'access_token')
    access_token_secret = _read_credential(config, 'access_token_secret')

    # Without both a handle and a body there is nothing to look up. The old
    # code only reported a missing body and then crashed iterating None.
    if potential_user is None or body is None:
        print(colored("[*] It looks like OCR failed. Please make sure you " +
                      "crop the image as in sample and is readable.", 'red'))
        sys.exit(1)

    # The comparison works on words; iterating a plain string would compare
    # single characters against the tweet's words.
    body_words = body.split(' ') if isinstance(body, str) else list(body)

    tweets = []
    seen_ids = set()
    try:
        tuo = TwitterUserOrder(potential_user[1:])
        ts = TwitterSearch(
            consumer_key=consumer_key,
            consumer_secret=consumer_secret,
            access_token=access_token,
            access_token_secret=access_token_secret
        )
        for tweet in ts.search_tweets_iterable(tuo):
            # Nobody cares about re-tweets. Only a leading "RT " is treated
            # as a retweet marker; a substring test would also drop tweets
            # that merely contain a word ending in "RT".
            if tweet['text'].startswith('RT '):
                continue
            # De-duplicate on the tweet id (comparing the raw tweet dict
            # against the stored tuples could never match).
            if tweet['id'] in seen_ids:
                continue
            seen_ids.add(tweet['id'])
            tweets.append((tweet['text'], tweet['id']))
            # Checked after appending so the iterator is not advanced (and
            # no further results requested) once the limit is reached.
            if len(tweets) >= limit_of_tweets:
                break
    except TwitterSearchException as e:  # catch all those ugly errors
        print(e)
        return

    found_tweet = False
    # Check against every tweet pulled
    for tweet_text, tweet_id in tweets:
        removal_rate = _word_overlap_percent(body_words, tweet_text)
        confidence = int(removal_rate)

        if confidence > 75:
            found_tweet = True
            _report_match("[*] It looks like this is a valid tweet",
                          removal_rate, potential_user, tweet_id, 'green')
        # Range check: the previous ``in (55, 75)`` only matched the two
        # exact values 55 and 75.
        elif confidence >= 55:
            found_tweet = True
            _report_match("[*] This might be a valid tweet",
                          removal_rate, potential_user, tweet_id, 'yellow')

    if not found_tweet:
        print(colored("[*] I couldn't find a tweet like that. " +
                      "Try increasing the limit to pull more tweets",
                      'yellow'))


def main():
    """OCR the ``--image`` screenshot, then look the result up on Twitter."""
    ocr_result = find_user_and_text_in_tweet_image(args.image)
    user, tweet_text = ocr_result
    _do_lookup(user, tweet_text)


if __name__ == '__main__':
    main()
59 changes: 59 additions & 0 deletions fakemenot/ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
This requires the tesseract library (with the english training data) to do OCR on images.
See https://github.com/tesseract-ocr/tesseract/wiki and https://pypi.python.org/pypi/pytesseract
for installation instructions.
"""

from PIL import Image, ImageEnhance, ImageFilter
from pytesseract import image_to_string


def find_user_and_text_in_tweet_image(image_path):
    """Run OCR on a tweet screenshot and extract the handle and tweet text.

    Returns a ``(user, body)`` pair as produced by
    ``extract_values_from_desktop_tweet``, or ``(None, None)`` when the
    image could not be prepared.
    """
    prepared = prepare_image_for_ocr(image_path)
    if prepared:
        raw_text = image_to_string(prepared, lang='eng')
        # Treat line breaks as word separators, then split on single spaces.
        flattened = raw_text.replace('\n', ' ')
        # Only the desktop detail view of a tweet is supported for now.
        return extract_values_from_desktop_tweet(flattened.split(' '))
    return None, None


def extract_values_from_desktop_tweet(words):
    """Pick the user handle and the tweet body out of a list of OCR'd words.

    The handle is the first word that starts with ``@`` and is longer than
    one character. The body is everything from the third word after the
    handle onwards, joined with single spaces and stripped.

    Returns ``(user, body)``; ``user`` is ``None`` when no handle is found,
    and ``body`` is ``None`` when there is no handle or no words remain
    after the skipped ones.
    """
    for position, word in enumerate(words):
        if not word:
            continue
        if len(word) > 1 and word[0] == '@':
            break
    else:
        # No handle anywhere in the OCR output.
        return None, None

    # Usually there are 2 random chars after the user handle, then the body
    # starts.
    tail = words[position + 3:]
    if not tail:
        return word, None
    return word, " ".join(tail).strip()


def prepare_image_for_ocr(image_path):
    """Open an image and enlarge/clean it so OCR has an easier job.

    The image is scaled to a width of 4096 pixels (keeping the aspect
    ratio), passed through a median filter and its contrast is doubled.

    Args:
        image_path: path of the image file to open.

    Returns:
        The processed PIL image, or ``None`` when the file cannot be opened.
    """
    # Open the image
    try:
        image = Image.open(image_path)
    except (OSError, IOError):
        return None

    # Resize image; guard against a height of 0 for extremely wide images.
    width = 4096
    height = max(1, int(float(width) / image.size[0] * image.size[1]))
    # Image.LANCZOS is the same filter as the old Image.ANTIALIAS alias,
    # which was removed in Pillow 10.
    image = image.resize((width, height), Image.LANCZOS)

    # Clean image and increase contrast. See: https://stackoverflow.com/a/37750605/5486120
    image = image.filter(ImageFilter.MedianFilter())
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    return image
Binary file added fakemenot/tests/res/test_ocr_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added fakemenot/tests/res/test_ocr_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added fakemenot/tests/res/test_ocr_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading