2020-12-15 20:34:37 +00:00
#!/usr/bin/python
### This program renames image files after the text they contain (read via OCR)
2020-09-10 07:55:00 +00:00
2020-12-15 20:34:37 +00:00
##SETUP
2020-09-09 18:06:33 +00:00
try :
2020-12-15 20:34:37 +00:00
from PIL import Image
2020-09-09 18:06:33 +00:00
except ImportError :
2020-12-15 20:34:37 +00:00
import Image
2020-09-09 18:06:33 +00:00
import pytesseract
import os
import string
2020-12-15 20:34:37 +00:00
import hashlib
2020-09-09 18:06:33 +00:00
# Names of the regular files found in the input directory.
namearray = []
# Loop counter for walking namearray.
indexnow = 0

print("OS name: ", os.name)
if os.name == "nt":  # Running on Windows
    # Absolute path to the Tesseract installation
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract"
    PATHSEPARATOR = "\\"
else:  # Any other platform
    # Absolute path to the Tesseract installation
    pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"
    PATHSEPARATOR = "/"
2020-09-09 18:06:33 +00:00
2020-09-10 07:55:00 +00:00
# Define the input and output directories
2020-12-15 20:34:37 +00:00
# Input and output directories, relative to the current working directory.
BASEPATHINPUT = "." + PATHSEPARATOR + "ImagesToConvert"
print("BASEPATHINPUT = " + BASEPATHINPUT)
BASEPATHOUTPUT = "." + PATHSEPARATOR + "ImagesOutput"
print("BASEPATHOUTPUT = " + BASEPATHOUTPUT)

# Resolve the two directories named above to absolute paths.
ABSOLUTEPATHINPUT = os.path.abspath(BASEPATHINPUT)
print("ABSOLUTEPATHINPUT = " + ABSOLUTEPATHINPUT)
ABSOLUTEPATHOUTPUT = os.path.abspath(BASEPATHOUTPUT)
print("ABSOLUTEPATHOUTPUT = " + ABSOLUTEPATHOUTPUT)
##FUNCTIONS
def ocr(filename):
    """Run Tesseract OCR on an image from the input directory.

    Args:
        filename: Name of a file located in ABSOLUTEPATHINPUT.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for that image.
    """
    image_path = ABSOLUTEPATHINPUT + PATHSEPARATOR + filename
    # Open the image with Pillow inside a context manager so its file handle
    # is closed as soon as OCR is done; the caller renames the file afterwards.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
def getHashedOut(filename):
    """Hash the pixel data of an image from the input directory.

    Args:
        filename: Name of a file located in ABSOLUTEPATHINPUT.

    Returns:
        The ``hashlib.md5`` hash object built from ``Image.tobytes()``;
        callers obtain the text form with ``.hexdigest()``.
    """
    image_path = ABSOLUTEPATHINPUT + PATHSEPARATOR + filename
    # The context manager closes the image file once its bytes have been
    # read, instead of leaving the handle open.
    with Image.open(image_path) as image:
        return hashlib.md5(image.tobytes())
# Characters removed from candidate file names: the set forbidden on NTFS
# (< > : " / \ | ? *) plus dots, so the only dot left is the one added
# later in front of the extension.
_NAMECLEANER_DROP = str.maketrans("", "", '<>:"/\\|?*.')


def namecleaner(filename):
    """Turn raw OCR text into a string usable as a file name.

    Whitespace runs (spaces, tabs, line breaks) become a single '-',
    characters that are forbidden on NTFS and all dots are removed, and
    repeated '-' are collapsed into one.

    Args:
        filename: The raw text to clean.

    Returns:
        The cleaned text; an empty string when nothing usable is left.
    """
    print("filenameis " + filename)
    # split() with no argument drops every kind of whitespace, so no
    # separate removal of spaces or line breaks is needed afterwards.
    filename = "-".join(filename.split())
    # Remove all unwanted characters in a single pass.
    filename = filename.translate(_NAMECLEANER_DROP)
    # Removing characters can leave neighbouring dashes behind; collapse them.
    while "--" in filename:
        filename = filename.replace("--", "-")
    print("outputnameis " + filename)
    return filename
2020-09-10 07:55:00 +00:00
2020-12-15 20:34:37 +00:00
##CORECODE
2020-09-10 07:55:00 +00:00
# List all files in a directory using scandir()
2020-12-15 20:34:37 +00:00
with os.scandir(ABSOLUTEPATHINPUT) as directory_entries:
    # Collect the names of regular files only; anything else is skipped.
    namearray.extend(
        item.name for item in directory_entries if item.is_file()
    )
2020-09-09 18:06:33 +00:00
2020-09-10 07:55:00 +00:00
# Get array length for the loop
2020-09-09 18:06:33 +00:00
# The renames below fail if the output directory is missing, so create it.
os.makedirs(ABSOLUTEPATHOUTPUT, exist_ok=True)

arraylength = len(namearray)
print("arraylength = " + str(arraylength))
for indexnow, basename in enumerate(namearray):
    print("basename = " + basename)
    # Read the text contained in the image.
    newname = ocr(basename)
    print("newname = " + newname)
    # Strip forbidden characters, line breaks and spaces from that text.
    cleanname = namecleaner(newname)
    print("cleanname = " + cleanname)
    if cleanname != "":
        # Cap the length so the final name stays within file system limits.
        stem = cleanname[:200]
        message = " is now renamed as "
    else:
        # No usable text was recognised: name the file after its pixel hash.
        stem = getHashedOut(basename).hexdigest()
        message = " is now "
    # splitext yields "" for names without a dot, so such files are not
    # given their whole old name as a bogus extension.
    extension = os.path.splitext(basename)[1]
    cleanname = stem + extension
    destination = os.path.join(ABSOLUTEPATHOUTPUT, cleanname)
    # Two images can produce the same name. os.rename would silently replace
    # the earlier file on POSIX and raise on Windows, so pick a free name.
    duplicate = 1
    while os.path.exists(destination):
        cleanname = stem + "-" + str(duplicate) + extension
        destination = os.path.join(ABSOLUTEPATHOUTPUT, cleanname)
        duplicate += 1
    os.rename(os.path.join(ABSOLUTEPATHINPUT, basename), destination)
    print(basename + message + cleanname)

print("All images are given a name")