2020-09-10 07:55:00 +00:00
|
|
|
"""This Program is used to rename image files with its content"""
|
|
|
|
|
|
|
|
"""SETUP"""
|
2020-09-09 18:06:33 +00:00
|
|
|
try:
|
|
|
|
from PIL import Image
|
|
|
|
except ImportError:
|
|
|
|
import Image
|
|
|
|
import pytesseract
|
|
|
|
import os
|
|
|
|
import string
|
|
|
|
|
|
|
|
# Names of the files found in the input directory.
namearray = []
# Running index into namearray; starts at the first entry.
indexnow = 0

# Absolute path to our Tesseract installation.
# The setting lives on the inner ``pytesseract.pytesseract`` module; assigning
# ``pytesseract.tesseract_cmd`` on the package is silently ignored.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
|
2020-09-09 18:06:33 +00:00
|
|
|
|
2020-09-10 07:55:00 +00:00
|
|
|
"""FUNCTIONS"""
|
2020-09-09 18:06:33 +00:00
|
|
|
def ocr(filename):
    """Return the text Tesseract recognises in one input image.

    Args:
        filename: Name of an image file located in the module-level
            ``absolutebasepathin`` directory.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for that image.
    """
    imagepath = os.path.join(absolutebasepathin, filename)
    # Use Pillow to open the image for pytesseract, and close it as soon as
    # OCR is done: the caller renames the file right afterwards, and a handle
    # left open can prevent that on Windows.
    with Image.open(imagepath) as image:
        return pytesseract.image_to_string(image)
|
|
|
|
|
|
|
|
def namecleaner(filename, source_name=None):
    """Turn raw OCR text into a file name that is safe to use on Windows.

    Whitespace runs (spaces, line breaks) become single underscores, the
    characters Windows forbids in file names and all dots are removed, and
    the extension of the original image is appended.

    Args:
        filename: Text to build the new name from (the OCR output).
        source_name: Name of the original file, used for its extension.
            When omitted, the module-level ``basename`` set by the calling
            loop is used, which keeps existing one-argument calls working.

    Returns:
        The cleaned name followed by the original file's extension. If no
        usable character is left after cleaning, the original file's own
        stem is used so the result is never just an extension.
    """
    if source_name is None:
        # Backward compatibility: the processing loop publishes the current
        # file name through the global ``basename``.
        source_name = basename

    print("filenameis "+filename)

    # Get rid of line breaks and spaces. split() already drops every kind of
    # whitespace, so no separate space/newline passes are needed afterwards.
    cleaned = "_".join(filename.split())

    # Get rid of forbidden characters (< > : " / \ | ? *) and of extra dots
    # in a single pass.
    cleaned = cleaned.translate(str.maketrans("", "", '<>:"/\\|?*.'))

    # Clean duplicated '_'s. Looping (rather than one replace) also collapses
    # runs of three or more, including those created by the removals above.
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")

    # Get the extension of the file, whatever its length (.png, .jpeg, ...).
    stem, extension = os.path.splitext(source_name)

    if not cleaned:
        # OCR found nothing usable: keep the original name instead of
        # producing an extension-only file name.
        cleaned = stem

    outputname = cleaned + extension
    print("outputnameis "+outputname)
    return outputname
|
|
|
|
|
2020-09-10 07:55:00 +00:00
|
|
|
"""CORECODE"""
|
|
|
|
# Define the input and output directories.
# Raw strings keep the backslashes literal ('\I' is not a valid escape).
basepathin = r'.\ImagesToConvert'
print("basepathin = "+basepathin)
# Same directory the renamed files are actually moved to (absolutebasepathout).
basepathout = r'.\ImagesOutput'
print("basepathout = "+basepathout)

# Getting the absolute path to the before named directories
absolutebasepathin = os.path.abspath(basepathin)
print("absolutebasepathin = "+absolutebasepathin)
absolutebasepathout = os.path.abspath(basepathout)
print("absolutebasepathout = "+absolutebasepathout)

# Make sure the destination exists, otherwise the first rename would fail.
os.makedirs(absolutebasepathout, exist_ok=True)

# List all files in the input directory using scandir() and fill the array
# with their names (sub-directories are skipped).
with os.scandir(basepathin) as entries:
    namearray.extend(entry.name for entry in entries if entry.is_file())

# Number of files that will be processed
arraylength = len(namearray)
print("arraylength = "+str(arraylength))

# The loop variable must stay the module-level name ``basename``:
# namecleaner() reads it to recover the extension of the current file.
for basename in namearray:
    print("basename = "+basename)

    # Call ocr
    newname = ocr(basename)
    print("newname = "+newname)

    # Call "namecleaner" to get rid of forbidden characters, line breaks and spaces.
    cleanname = namecleaner(newname)
    print("cleanname = "+cleanname)

    # Move the image to the output directory under its new name.
    os.rename(
        os.path.join(absolutebasepathin, basename),
        os.path.join(absolutebasepathout, cleanname),
    )
    print(basename+" is now renamed as "+cleanname)

print("All images are given a name")
|