Added compatibility with Linux, fixed a bug, and rewrote a bunch of stuff
This commit is contained in:
parent
7952487c2a
commit
5dfe274e58
|
@ -1,94 +1,104 @@
|
||||||
"""This Program is used to rename image files with its content"""
|
#!/usr/bin/python
|
||||||
|
###This Program is used to rename image files with its content
|
||||||
|
|
||||||
"""SETUP"""
|
##SETUP
|
||||||
try:
|
try:
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import Image
|
import Image
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import os
|
import os
|
||||||
import string
|
import string
|
||||||
import uuid
|
import hashlib
|
||||||
|
|
||||||
namearray = []
|
namearray = []
|
||||||
indexnow = 0
|
indexnow = 0
|
||||||
|
|
||||||
pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract' #absolute path to out tesseract instalation
|
print("OS name:", os.name)
|
||||||
|
if os.name == "nt": #U are using windows?
|
||||||
|
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract" #absolute path to out tesseract instalation
|
||||||
|
PATHSEPARATOR = "\\"
|
||||||
|
else: #No? NICE
|
||||||
|
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract" #absolute path to teseract instalation
|
||||||
|
PATHSEPARATOR = "/"
|
||||||
|
|
||||||
"""FUNCTIONS"""
|
|
||||||
def ocr(filename):
|
|
||||||
newname = pytesseract.image_to_string(Image.open(absolutebasepathin+"\\"+filename)) # Use pillow to open an image for pytesseract image2string to use
|
|
||||||
return newname
|
|
||||||
|
|
||||||
def namecleaner(filename):
|
|
||||||
print("filenameis "+filename)
|
|
||||||
filename = "_".join(filename.split()) # Get rig of line braks and spaces
|
|
||||||
filename = filename.replace("__","_") # Cleaning duplicated '_'s
|
|
||||||
filename = filename.replace(" ","") # Get rig of spaces(For recursion)
|
|
||||||
filename = filename.replace("\n","") # Get rig of line braks(For recursion)
|
|
||||||
#Get rid of forbiden characters
|
|
||||||
filename = filename.replace("<","")
|
|
||||||
filename = filename.replace(">","")
|
|
||||||
filename = filename.replace(":","")
|
|
||||||
filename = filename.replace("\"","")
|
|
||||||
filename = filename.replace("/","")
|
|
||||||
filename = filename.replace("\\","")
|
|
||||||
filename = filename.replace("|","")
|
|
||||||
filename = filename.replace("?","")
|
|
||||||
filename = filename.replace("*","")
|
|
||||||
##################################
|
|
||||||
outputname = filename.replace(".","") # Get rid of extra dots
|
|
||||||
print("outputnameis "+outputname)
|
|
||||||
return outputname
|
|
||||||
|
|
||||||
"""CORECODE"""
|
|
||||||
# Define the input and output directories
|
# Define the input and output directories
|
||||||
basepathin = '.\ImagesToConvert'
|
BASEPATHINPUT = "."+PATHSEPARATOR+"ImagesToConvert"
|
||||||
print("basepathin = "+basepathin)
|
print("BASEPATHINPUT = "+BASEPATHINPUT)
|
||||||
basepathout = '.\ImageOutput'
|
BASEPATHOUTPUT = "."+PATHSEPARATOR+"ImagesOutput"
|
||||||
print("basepathout = "+basepathout)
|
print("BASEPATHOUTPUT = "+BASEPATHOUTPUT)
|
||||||
|
|
||||||
# Getting the absolute path to the before named directories
|
# Getting the absolute path to the before named directories
|
||||||
absolutebasepathin = os.path.abspath('.\ImagesToConvert')
|
ABSOLUTEPATHINPUT = os.path.abspath(BASEPATHINPUT)
|
||||||
print("absolutebasepathin = "+absolutebasepathin)
|
print("ABSOLUTEPATHINPUT = "+ABSOLUTEPATHINPUT)
|
||||||
absolutebasepathout = os.path.abspath('.\ImagesOutput')
|
ABSOLUTEPATHOUTPUT = os.path.abspath(BASEPATHOUTPUT)
|
||||||
print("absolutebasepathout = "+absolutebasepathout)
|
print("ABSOLUTEPATHOUTPUT = "+ABSOLUTEPATHOUTPUT)
|
||||||
|
|
||||||
|
##FUNCTIONS
|
||||||
|
def ocr(filename):
    """Return the text that Tesseract recognizes in an input image.

    Args:
        filename: Name of an image file inside ABSOLUTEPATHINPUT
            (file name only, not a full path).

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # os.path.join picks the right separator for the running OS.
    imagepath = os.path.join(ABSOLUTEPATHINPUT, filename)
    # The context manager closes the image as soon as OCR is done, so no
    # handle is left open on a file that the caller renames afterwards.
    with Image.open(imagepath) as image:
        newname = pytesseract.image_to_string(image)
    return newname
|
||||||
|
|
||||||
|
def getHashedOut(filename):
    """Return an MD5 hash object computed over an input image's data.

    Args:
        filename: Name of an image file inside ABSOLUTEPATHINPUT
            (file name only, not a full path).

    Returns:
        The ``hashlib.md5`` object built from ``Image.tobytes()``; call
        ``.hexdigest()`` on it to obtain the printable digest.
    """
    # os.path.join picks the right separator for the running OS.
    imagepath = os.path.join(ABSOLUTEPATHINPUT, filename)
    # Close the image right after reading it instead of leaving the handle
    # open on a file that is renamed later.
    with Image.open(imagepath) as image:
        return hashlib.md5(image.tobytes())
|
||||||
|
|
||||||
|
def namecleaner(filename):
    """Turn arbitrary text (e.g. OCR output) into a safe file-name stem.

    Steps:
      1. Every run of whitespace (spaces, tabs, line breaks) becomes one "-".
      2. Characters that NTFS forbids in file names (< > : " / \\ | ? *),
         ASCII control characters and dots are removed.
      3. Repeated "-" are collapsed and leading/trailing "-" are stripped,
         so text made only of forbidden characters yields "" and the caller
         can detect that no usable name was produced.

    Args:
        filename: The raw text to clean.

    Returns:
        The cleaned name without extension; may be the empty string.
    """
    print("filenameis "+filename)
    # Forbidden NTFS characters, dots, and control characters 0-31.
    forbidden = '<>:"/\\|?*.' + "".join(map(chr, range(32)))
    # split() with no argument already drops every kind of whitespace, so no
    # separate space / line-break removal is needed afterwards.
    filename = "-".join(filename.split())
    # One pass deletes all unwanted characters.
    filename = filename.translate(str.maketrans("", "", forbidden))
    # Removing characters can leave several "-" side by side.
    while "--" in filename:
        filename = filename.replace("--", "-")  # Cleaning duplicated '-'s
    # A name must not start or end with a separator (and "-" alone is no name).
    filename = filename.strip("-")
    print("outputnameis "+filename)
    return filename
|
||||||
|
|
||||||
|
##CORECODE
|
||||||
# List all files in a directory using scandir()
|
# List all files in a directory using scandir()
|
||||||
with os.scandir(absolutebasepathin) as entries:
|
with os.scandir(ABSOLUTEPATHINPUT) as entries:
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
if entry.is_file():
|
if entry.is_file():
|
||||||
# Fill an array with the list
|
# Fill an array with the list
|
||||||
namearray.append(entry.name)
|
namearray.append(entry.name)
|
||||||
|
|
||||||
# Get array length for the loop
|
# Get array length for the loop
|
||||||
arraylength = len(namearray)
|
arraylength = len(namearray)
|
||||||
print("arraylength = "+str(arraylength))
|
print("arraylength = "+str(arraylength))
|
||||||
|
|
||||||
while indexnow < arraylength:
|
while indexnow < arraylength:
|
||||||
# Get file name from the array
|
# Get file name from the array
|
||||||
basename = namearray[indexnow]
|
basename = namearray[indexnow]
|
||||||
print("basename = "+basename)
|
print("basename = "+basename)
|
||||||
# Call ocr
|
# Call ocr
|
||||||
newname = ocr(basename)
|
newname = ocr(basename)
|
||||||
print("newname = "+newname)
|
print("newname = "+newname)
|
||||||
# Call "namecleaner" to get rid of forbiden characters, line breaks and spaces.
|
# Call "namecleaner" to get rid of forbiden characters, line breaks and spaces.
|
||||||
cleanname = namecleaner(newname)
|
cleanname = namecleaner(newname)
|
||||||
print("cleanname = "+cleanname)
|
print("cleanname = "+cleanname)
|
||||||
if cleanname != "":
|
if cleanname != "":
|
||||||
if len(cleanname) > 250:
|
if len(cleanname) > 200:
|
||||||
cleanname = cleanname[0:250]
|
cleanname = cleanname[0:200]
|
||||||
cleanname = cleanname + basename[len(basename)-4:len(basename)]
|
cleanname = cleanname + "." + basename.split(".")[-1]
|
||||||
os.rename(absolutebasepathin+"\\"+basename, absolutebasepathout+"\\"+cleanname)
|
os.rename(ABSOLUTEPATHINPUT+PATHSEPARATOR+basename, ABSOLUTEPATHOUTPUT+PATHSEPARATOR+cleanname)
|
||||||
print(basename+" is now renamed as "+cleanname)
|
print(basename+" is now renamed as "+cleanname)
|
||||||
else:
|
else:
|
||||||
UUIDnow = str(uuid.uuid4())
|
imgHash = getHashedOut(basename).hexdigest()
|
||||||
cleanname = namecleaner(UUIDnow)
|
cleanname = imgHash + "." + basename.split(".")[-1]
|
||||||
cleanname = cleanname + basename[len(basename)-4:len(basename)]
|
os.rename(ABSOLUTEPATHINPUT+PATHSEPARATOR+basename, ABSOLUTEPATHOUTPUT+PATHSEPARATOR+cleanname)
|
||||||
os.rename(absolutebasepathin+"\\"+basename, absolutebasepathout+"\\"+cleanname)
|
print(basename+" is now "+cleanname)
|
||||||
print(basename+" is now "+cleanname)
|
|
||||||
|
|
||||||
indexnow += 1
|
indexnow += 1
|
||||||
|
|
||||||
|
|
||||||
print("All images are given a name")
|
print("All images are given a name")
|
||||||
|
|
|
@ -6,6 +6,7 @@ echo install python: https://www.python.org/downloads/
|
||||||
start https://www.python.org/downloads/
|
start https://www.python.org/downloads/
|
||||||
|
|
||||||
echo also install tesseract https://tesseract-ocr.github.io/tessdoc/Downloads.html
|
echo also install tesseract https://tesseract-ocr.github.io/tessdoc/Downloads.html
|
||||||
|
echo recomended the UB Mannheim https://github.com/UB-Mannheim/tesseract/wiki
|
||||||
start https://tesseract-ocr.github.io/tessdoc/Downloads.html
|
start https://tesseract-ocr.github.io/tessdoc/Downloads.html
|
||||||
|
|
||||||
echo wait untill instalation
|
echo wait untill instalation
|
||||||
|
@ -15,3 +16,6 @@ md ImagesToConvert
|
||||||
md ImagesOutput
|
md ImagesOutput
|
||||||
pip install Pillow
|
pip install Pillow
|
||||||
pip install pytesseract
|
pip install pytesseract
|
||||||
|
rem hashlib is part of the Python standard library; it must not be installed with pip
|
||||||
|
|
||||||
|
echo Instalation finished
|
||||||
|
|
3
setup.sh
3
setup.sh
|
@ -14,5 +14,8 @@ mkdir ImagesToConvert
|
||||||
mkdir ImagesOutput
|
mkdir ImagesOutput
|
||||||
pip install Pillow
|
pip install Pillow
|
||||||
pip install pytesseract
|
pip install pytesseract
|
||||||
|
# hashlib is part of the Python standard library; it must not be installed with pip
|
||||||
|
|
||||||
|
sudo cp eng.traineddata /usr/share/tessdata/
|
||||||
|
|
||||||
echo "Installation finished!"
|
echo "Installation finished!"
|
||||||
|
|
Loading…
Reference in New Issue