#!/usr/bin/env python3
"""Rename image files after the text they contain.

Every regular file in ``./ImagesToConvert`` is read with Tesseract OCR.  The
recognised text is turned into a filesystem-safe name and the file is moved
to ``./ImagesOutput`` under that name, keeping its original extension.  When
no text is recognised, the MD5 digest of the image's pixel data is used as
the name instead.

Setup notes:
    * Needs the third-party packages ``Pillow`` and ``pytesseract`` and a
      Tesseract installation.
    * ``hashlib`` is part of the Python standard library and must NOT be
      installed with pip.
"""

##SETUP
import hashlib
import os
import shutil
import string

# Kept for backward compatibility; os.sep is "\\" on Windows and "/" elsewhere,
# which is exactly what the previous hand-written branch assigned.
PATHSEPARATOR = os.sep

# Location used when Tesseract cannot be found on PATH.
if os.name == "nt":  # Windows
    _DEFAULT_TESSERACT_CMD = r"C:\Program Files (x86)\Tesseract-OCR\tesseract"
else:
    _DEFAULT_TESSERACT_CMD = r"/usr/bin/tesseract"

# Prefer whatever "tesseract" resolves to on PATH, so installs outside the
# default location also work; otherwise fall back to the default path.
TESSERACT_CMD = shutil.which("tesseract") or _DEFAULT_TESSERACT_CMD

# Define the input and output directories
BASEPATHINPUT = os.path.join(".", "ImagesToConvert")
BASEPATHOUTPUT = os.path.join(".", "ImagesOutput")

# Absolute paths to the directories named above
ABSOLUTEPATHINPUT = os.path.abspath(BASEPATHINPUT)
ABSOLUTEPATHOUTPUT = os.path.abspath(BASEPATHOUTPUT)

# Longest name (without extension) given to a renamed file.
MAX_STEM_LENGTH = 200

# Characters that are not allowed in NTFS file names, plus "." so that the
# only dot in the final name is the one in front of the extension.
_FORBIDDEN_CHARACTERS = str.maketrans("", "", '<>:"/\\|?*.')


##FUNCTIONS
def _open_input_image(filename):
    """Open *filename* from the input directory with Pillow.

    Pillow is imported here rather than at module level so that the pure
    helpers (e.g. ``namecleaner``) stay importable without it.
    """
    try:
        from PIL import Image
    except ImportError:
        import Image
    return Image.open(os.path.join(ABSOLUTEPATHINPUT, filename))


def _load_pytesseract():
    """Import pytesseract and point it at the configured Tesseract binary."""
    import pytesseract

    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
    return pytesseract


def ocr(filename):
    """Return the text Tesseract recognises in input image *filename*."""
    pytesseract = _load_pytesseract()
    # The context manager closes the image file once OCR is done.
    with _open_input_image(filename) as image:
        return pytesseract.image_to_string(image)


def getHashedOut(filename):
    """Return an MD5 hash object built from the pixel data of *filename*.

    The digest is only used as a fallback file name, not for security.
    """
    with _open_input_image(filename) as image:
        return hashlib.md5(image.tobytes())


def namecleaner(filename):
    """Turn OCR text into a string that is safe to use as a file name.

    Runs of whitespace (spaces, line breaks, ...) become a single "-",
    characters forbidden on NTFS and dots are dropped, and repeated "-" are
    collapsed.  Returns "" when nothing usable is left.
    """
    print("filenameis "+filename)
    # split() with no argument swallows every kind of whitespace, so no
    # spaces or line breaks survive the join.
    filename = "-".join(filename.split())
    filename = filename.translate(_FORBIDDEN_CHARACTERS)
    # Removing characters can leave hyphens next to each other.
    while "--" in filename:
        filename = filename.replace("--", "-")
    print("outputnameis "+filename)
    return filename


def _unique_destination(directory, stem, extension):
    """Return a path in *directory* for stem+extension that is not taken.

    A numeric suffix ("-1", "-2", ...) is appended when needed, so two images
    with the same text (or identical images) never overwrite each other.
    """
    candidate = os.path.join(directory, stem + extension)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, stem + "-" + str(counter) + extension)
        counter += 1
    return candidate


##CORECODE
def main():
    """Rename every file of the input directory into the output directory."""
    print("OS name:", os.name)
    print("BASEPATHINPUT = "+BASEPATHINPUT)
    print("BASEPATHOUTPUT = "+BASEPATHOUTPUT)
    print("ABSOLUTEPATHINPUT = "+ABSOLUTEPATHINPUT)
    print("ABSOLUTEPATHOUTPUT = "+ABSOLUTEPATHOUTPUT)

    # os.rename does not create missing directories.
    os.makedirs(ABSOLUTEPATHOUTPUT, exist_ok=True)

    # List all regular files of the input directory up front, because the
    # loop below moves them away while it runs.
    with os.scandir(ABSOLUTEPATHINPUT) as entries:
        namearray = [entry.name for entry in entries if entry.is_file()]
    print("arraylength = "+str(len(namearray)))

    for basename in namearray:
        print("basename = "+basename)
        newname = ocr(basename)
        print("newname = "+newname)
        # Get rid of forbidden characters, line breaks and spaces.
        cleanname = namecleaner(newname)
        print("cleanname = "+cleanname)

        # splitext gives "" for files without an extension instead of
        # mistaking the whole name for one.
        extension = os.path.splitext(basename)[1]
        if cleanname:
            stem = cleanname[:MAX_STEM_LENGTH]
            message = " is now renamed as "
        else:
            # No text recognised: name the file after its pixel data.
            stem = getHashedOut(basename).hexdigest()
            message = " is now "

        destination = _unique_destination(ABSOLUTEPATHOUTPUT, stem, extension)
        os.rename(os.path.join(ABSOLUTEPATHINPUT, basename), destination)
        print(basename+message+os.path.basename(destination))

    print("All images are given a name")


if __name__ == "__main__":
    main()