#!/usr/bin/python
"""Rename image files after the text that OCR finds inside them.

Every regular file in ``./ImagesToConvert`` is passed to Tesseract.  The
recognised text is turned into a file-system-safe name and the file is moved
into ``./ImagesOutput`` under that name, keeping its original extension.
When no usable text is recognised, the MD5 digest of the bytes returned by
``Image.tobytes()`` is used as the name instead.
"""

## SETUP
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import os
import string
import hashlib

print("OS name:", os.name)
if os.name == "nt":
    # Windows: absolute path to the Tesseract installation.
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract"
    PATHSEPARATOR = "\\"
else:
    # Anything else: absolute path to the Tesseract binary.
    pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"
    PATHSEPARATOR = "/"

# Input and output directories, relative to the current working directory.
BASEPATHINPUT = "." + PATHSEPARATOR + "ImagesToConvert"
print("BASEPATHINPUT = " + BASEPATHINPUT)
BASEPATHOUTPUT = "." + PATHSEPARATOR + "ImagesOutput"
print("BASEPATHOUTPUT = " + BASEPATHOUTPUT)

# Absolute versions of the directories above.
ABSOLUTEPATHINPUT = os.path.abspath(BASEPATHINPUT)
print("ABSOLUTEPATHINPUT = " + ABSOLUTEPATHINPUT)
ABSOLUTEPATHOUTPUT = os.path.abspath(BASEPATHOUTPUT)
print("ABSOLUTEPATHOUTPUT = " + ABSOLUTEPATHOUTPUT)

# Longest OCR-derived name (without extension) that is kept.
MAXNAMELENGTH = 200

# Characters deleted from OCR text: the ones NTFS forbids in file names,
# plus "." so that the only dot left is the one before the extension.
_FORBIDDEN = str.maketrans("", "", '<>:"/\\|?*.')


## FUNCTIONS
def ocr(filename):
    """Return the text Tesseract recognises in an input image.

    Args:
        filename: Name of a file inside ``ABSOLUTEPATHINPUT``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the file handle before the caller renames
    # the file.
    with Image.open(os.path.join(ABSOLUTEPATHINPUT, filename)) as image:
        return pytesseract.image_to_string(image)


def getHashedOut(filename):
    """Return an MD5 hash object built from an input image.

    Args:
        filename: Name of a file inside ``ABSOLUTEPATHINPUT``.

    Returns:
        A ``hashlib.md5`` object fed with ``Image.tobytes()``; callers take
        ``.hexdigest()`` of it.
    """
    with Image.open(os.path.join(ABSOLUTEPATHINPUT, filename)) as image:
        return hashlib.md5(image.tobytes())


def namecleaner(filename):
    """Turn raw OCR text into a string usable as a file name.

    Runs of whitespace (spaces, line breaks, ...) become a single "-",
    the characters in ``_FORBIDDEN`` are dropped, and repeated "-" are
    collapsed into one.

    Args:
        filename: Raw text, typically the output of ``ocr``.

    Returns:
        The cleaned name; an empty string when the text was empty or
        whitespace only.
    """
    print("filenameis " + filename)
    # split() with no argument discards every kind of whitespace, so no
    # separate space / newline removal is needed afterwards.
    filename = "-".join(filename.split())
    filename = filename.translate(_FORBIDDEN)
    # Deleting characters can leave neighbouring dashes; collapse them.
    while "--" in filename:
        filename = filename.replace("--", "-")
    print("outputnameis " + filename)
    return filename


def _unique_target(directory, stem, extension):
    """Return a path in *directory* that does not exist yet.

    Tries ``stem + extension`` first, then ``stem-1``, ``stem-2``, ... so
    that two images producing the same name never overwrite each other.
    """
    candidate = os.path.join(directory, stem + extension)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem}-{counter}{extension}")
        counter += 1
    return candidate


## CORECODE
def main():
    """Rename every file of the input directory into the output directory."""
    # os.rename fails when the destination directory is missing.
    os.makedirs(ABSOLUTEPATHOUTPUT, exist_ok=True)

    # Snapshot the directory listing first; files are moved away while the
    # list is being processed.
    with os.scandir(ABSOLUTEPATHINPUT) as entries:
        names = [entry.name for entry in entries if entry.is_file()]
    print("arraylength = " + str(len(names)))

    for basename in names:
        print("basename = " + basename)

        newname = ocr(basename)
        print("newname = " + newname)

        # Remove forbidden characters, line breaks and spaces.
        cleanname = namecleaner(newname)
        print("cleanname = " + cleanname)

        # splitext returns "" for a name without a dot, so the whole file
        # name is never mistaken for an extension.
        extension = os.path.splitext(basename)[1]

        if cleanname:
            stem = cleanname[:MAXNAMELENGTH]
            verb = " is now renamed as "
        else:
            # Nothing readable in the image: fall back to a content hash.
            stem = getHashedOut(basename).hexdigest()
            verb = " is now "

        target = _unique_target(ABSOLUTEPATHOUTPUT, stem, extension)
        os.rename(os.path.join(ABSOLUTEPATHINPUT, basename), target)
        print(basename + verb + os.path.basename(target))

    print("All images are given a name")


if __name__ == "__main__":
    main()