Added compatibility with Linux, fixed a bug, and rewrote a bunch of stuff

This commit is contained in:
bizcochito 2020-12-15 21:34:37 +01:00
parent 7952487c2a
commit 5dfe274e58
3 changed files with 85 additions and 68 deletions

146
Imagenamer.py Normal file → Executable file
View File

@ -1,94 +1,104 @@
"""This Program is used to rename image files with its content"""
#!/usr/bin/python
###This Program is used to rename image files with its content
"""SETUP"""
##SETUP
try:
from PIL import Image
from PIL import Image
except ImportError:
import Image
import Image
import pytesseract
import os
import string
import uuid
import hashlib
# Names of the files found in the input directory (filled by the scandir loop further down)
namearray = []
# Position in namearray of the file currently handled by the main while-loop
indexnow = 0
# NOTE(review): this assigns tesseract_cmd on the top-level pytesseract package, while the
# OS-specific branches below assign pytesseract.pytesseract.tesseract_cmd -- confirm which one
# the installed pytesseract actually reads.
pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract' # absolute path to our tesseract installation
print("OS name:", os.name)
# Pick the tesseract binary location and the path separator according to the operating system
if os.name == "nt": # running on Windows
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract" # absolute path to our tesseract installation
PATHSEPARATOR = "\\"
else: # any other OS: use the /usr/bin location and a forward slash
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" # absolute path to the tesseract installation
PATHSEPARATOR = "/"
"""FUNCTIONS"""
def ocr(filename):
    """Return the text that Tesseract recognises in one input image.

    The image is looked up inside ``absolutebasepathin``; the path is built
    with a hard-coded backslash separator.
    """
    image_path = absolutebasepathin+"\\"+filename
    # Pillow opens the image, pytesseract converts it to a string
    picture = Image.open(image_path)
    return pytesseract.image_to_string(picture)
def namecleaner(filename):
    """Turn OCR text into a string usable as a file name.

    Whitespace runs (including line breaks) become a single ``_``, one pass
    of ``__`` -> ``_`` is applied, and the characters ``< > : " / \\ | ? *``
    and ``.`` are removed. The input and the result are printed for tracing.
    """
    print("filenameis "+filename)
    words = filename.split()
    joined = "_".join(words).replace("__", "_")
    # Space, newline, the forbidden path characters and dots are all dropped in one pass
    unwanted = ' \n<>:"/\\|?*.'
    outputname = joined.translate(str.maketrans("", "", unwanted))
    print("outputnameis "+outputname)
    return outputname
"""CORECODE"""
# Define the input and output directories, relative to the current working directory
# NOTE(review): '\I' is not a recognised escape sequence, so these non-raw literals keep the
# backslash but may trigger an invalid-escape warning; they are also Windows-only paths.
basepathin = '.\ImagesToConvert'
print("basepathin = "+basepathin)
# NOTE(review): this says 'ImageOutput' while every other output path here says 'ImagesOutput'.
basepathout = '.\ImageOutput'
print("basepathout = "+basepathout)
# Same directories, built with the OS-dependent PATHSEPARATOR
BASEPATHINPUT = "."+PATHSEPARATOR+"ImagesToConvert"
print("BASEPATHINPUT = "+BASEPATHINPUT)
BASEPATHOUTPUT = "."+PATHSEPARATOR+"ImagesOutput"
print("BASEPATHOUTPUT = "+BASEPATHOUTPUT)
# Resolve the directories named above to absolute paths
absolutebasepathin = os.path.abspath('.\ImagesToConvert')
print("absolutebasepathin = "+absolutebasepathin)
absolutebasepathout = os.path.abspath('.\ImagesOutput')
print("absolutebasepathout = "+absolutebasepathout)
# Absolute paths derived from the separator-aware relative paths
ABSOLUTEPATHINPUT = os.path.abspath(BASEPATHINPUT)
print("ABSOLUTEPATHINPUT = "+ABSOLUTEPATHINPUT)
ABSOLUTEPATHOUTPUT = os.path.abspath(BASEPATHOUTPUT)
print("ABSOLUTEPATHOUTPUT = "+ABSOLUTEPATHOUTPUT)
##FUNCTIONS
def ocr(filename):
    """Run Tesseract OCR on one image from the input directory.

    Args:
        filename: Name of the image file inside ``ABSOLUTEPATHINPUT``.

    Returns:
        The text pytesseract recognised in the image.
    """
    # Open the image in a context manager so its file handle is released
    # before the caller renames the file (an open handle can block a rename
    # on Windows).
    with Image.open(ABSOLUTEPATHINPUT+PATHSEPARATOR+filename) as image:
        return pytesseract.image_to_string(image)
def getHashedOut(filename):
    """Hash the pixel data of one image from the input directory.

    Args:
        filename: Name of the image file inside ``ABSOLUTEPATHINPUT``.

    Returns:
        A ``hashlib.md5`` hash object computed over ``Image.tobytes()``;
        callers take ``.hexdigest()`` from it to build a file name.
    """
    # Use a context manager so the image file is closed once the bytes have
    # been read, instead of leaving the handle open until garbage collection.
    with Image.open(ABSOLUTEPATHINPUT+PATHSEPARATOR+filename) as image:
        return hashlib.md5(image.tobytes())
def namecleaner(filename):
    """Turn OCR text into a string usable as a file name.

    Whitespace runs (including line breaks) become ``-``, the characters
    forbidden on NTFS (``< > : " / \\ | ? *``) and dots are removed,
    repeated dashes are collapsed, and leading/trailing dashes are dropped.

    Args:
        filename: Raw text, typically the OCR output for an image.

    Returns:
        The cleaned name. It is an empty string when nothing usable is left,
        which lets the caller fall back to another naming scheme.
    """
    print("filenameis "+filename)
    forbidden = '<>:"/\\|?*.'
    # split() with no argument discards every whitespace run, so no spaces or
    # line breaks survive the join.
    filename = "-".join(filename.split())
    filename = filename.translate(str.maketrans("", "", forbidden))
    # Removing characters can leave dashes side by side; collapse them.
    while "--" in filename:
        filename = filename.replace("--", "-")
    # Text made only of forbidden characters would otherwise come out as "-"
    # and be treated as a valid name; a leading "-" is also awkward on
    # command lines.
    filename = filename.strip("-")
    print("outputnameis "+filename)
    return filename
##CORECODE
# List all files in the input directory using scandir()
# (this first loop reads the directory named by absolutebasepathin)
with os.scandir(absolutebasepathin) as entries:
for entry in entries:
# Sub-directories and other non-file entries are skipped
if entry.is_file():
# Fill the array with the file names
namearray.append(entry.name)
# Same listing, reading the directory named by ABSOLUTEPATHINPUT
with os.scandir(ABSOLUTEPATHINPUT) as entries:
for entry in entries:
# Sub-directories and other non-file entries are skipped
if entry.is_file():
# Fill the array with the file names
namearray.append(entry.name)
# Get the array length for the main loop
arraylength = len(namearray)
print("arraylength = "+str(arraylength))
# Main loop: OCR every collected file and move it to the output directory under a name
# derived from its text. indexnow is advanced at the bottom of the loop.
while indexnow < arraylength:
# Get file name from the array
basename = namearray[indexnow]
print("basename = "+basename)
# Call ocr
newname = ocr(basename)
print("newname = "+newname)
# Call "namecleaner" to get rid of forbidden characters, line breaks and spaces.
cleanname = namecleaner(newname)
print("cleanname = "+cleanname)
if cleanname != "":
# Cap the OCR-derived name at 250 characters before the extension is appended
if len(cleanname) > 250:
cleanname = cleanname[0:250]
# The last four characters of the original name are re-used as the extension
cleanname = cleanname + basename[len(basename)-4:len(basename)]
os.rename(absolutebasepathin+"\\"+basename, absolutebasepathout+"\\"+cleanname)
print(basename+" is now renamed as "+cleanname)
else:
# No usable text was recognised: name the file after a random UUID instead
UUIDnow = str(uuid.uuid4())
cleanname = namecleaner(UUIDnow)
cleanname = cleanname + basename[len(basename)-4:len(basename)]
os.rename(absolutebasepathin+"\\"+basename, absolutebasepathout+"\\"+cleanname)
print(basename+" is now "+cleanname)
# Get file name from the array
basename = namearray[indexnow]
print("basename = "+basename)
# Call ocr
newname = ocr(basename)
print("newname = "+newname)
# Call "namecleaner" to get rid of forbidden characters, line breaks and spaces.
cleanname = namecleaner(newname)
print("cleanname = "+cleanname)
if cleanname != "":
# Cap the OCR-derived name at 200 characters before the extension is appended
if len(cleanname) > 200:
cleanname = cleanname[0:200]
# The extension is whatever follows the last dot of the original name.
# NOTE(review): for a name without any dot, split(".")[-1] is the whole name.
cleanname = cleanname + "." + basename.split(".")[-1]
# NOTE(review): two images with the same recognised text map to the same target;
# os.rename replaces an existing file on POSIX and raises on Windows -- confirm intent.
os.rename(ABSOLUTEPATHINPUT+PATHSEPARATOR+basename, ABSOLUTEPATHOUTPUT+PATHSEPARATOR+cleanname)
print(basename+" is now renamed as "+cleanname)
else:
# No usable text was recognised: name the file after the MD5 hex digest of its pixel data
imgHash = getHashedOut(basename).hexdigest()
cleanname = imgHash + "." + basename.split(".")[-1]
os.rename(ABSOLUTEPATHINPUT+PATHSEPARATOR+basename, ABSOLUTEPATHOUTPUT+PATHSEPARATOR+cleanname)
print(basename+" is now "+cleanname)
# Move on to the next file name
indexnow += 1
indexnow += 1
print("All images are given a name")

View File

@ -6,6 +6,7 @@ echo install python: https://www.python.org/downloads/
:: Open the Python download page with the default handler for URLs
start https://www.python.org/downloads/
echo also install tesseract https://tesseract-ocr.github.io/tessdoc/Downloads.html
echo recomended the UB Mannheim https://github.com/UB-Mannheim/tesseract/wiki
:: Open the Tesseract download page with the default handler for URLs
start https://tesseract-ocr.github.io/tessdoc/Downloads.html
echo wait untill instalation
@ -15,3 +16,6 @@ md ImagesToConvert
:: Create the output folder and install the third-party Python packages
md ImagesOutput
pip install Pillow
pip install pytesseract
:: hashlib ships with the Python standard library, so it is not installed with pip
echo Instalation finished

View File

@ -14,5 +14,8 @@ mkdir ImagesToConvert
# Create the output folder and install the third-party Python packages
mkdir ImagesOutput
pip install Pillow
pip install pytesseract
# hashlib ships with the Python standard library, so it is not installed with pip
# Copy the English trained data into /usr/share/tessdata/ (needs root)
sudo cp eng.traineddata /usr/share/tessdata/
echo "Installation finished!"