1. Problem
In Python, a MySQL connection is opened with pymysql.connect() and charset='utf8mb4', which is supposed to enable UTF-8 encoding so that Chinese data is handled correctly. However, when queries are actually executed, the Chinese text involved comes back garbled (mojibake).
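The symptom can be reproduced with a minimal sketch like the following (the host, credentials and database are the same placeholders used in the code later in this article, and the candidate table comes from the query in the solution code):
import pymysql
import pymysql.cursors

# Connect with charset='utf8mb4'; despite this, Chinese text in the results
# still shows up garbled, which is the problem described above.
connection = pymysql.connect(host='xxx.xxx.x.xxx',
                             user='abcd',
                             passwd='pwd1',
                             db='rep_db',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

with connection.cursor() as cursor:
    cursor.execute("SELECT candidateid, FirstName, LastName FROM candidate LIMIT 10")
    for row in cursor.fetchall():
        print(row)   # Chinese names appear as mojibake here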
2. Solution
To fix this, execute "SET NAMES utf8mb4" before running the query. This explicitly sets the character set used by the client connection to utf8mb4 (full UTF-8), so the Chinese data in the query and its results is parsed and handled correctly.
The modified code is shown below:
import pymysql
import pymysql.cursors
import os
import win32com.client
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import pyPdf
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter
import zipfile, re
import time
# READING DOC FILE FROM REMOTE LOCATION
def readfilesq9(n):
    connection = pymysql.connect(host='xxx.xxx.x.xxx',
                                 user='abcd',
                                 passwd='pwd1',
                                 db='rep_db',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    list1 = []
    with connection.cursor() as cursor:
        # Explicitly set the connection character set to utf8mb4
        cursor.execute("SET NAMES utf8mb4")
        # Read up to 100000 candidate records together with a concatenated resume path
        # (only the part of ResumePath from the first '/' onwards is used below)
        sql = r"SELECT candidateid,cnd.FirstName, cnd.LastName,Concat('\xxx.xxx.x.xxx\File\Cand_Res/',orgguid,'/',DATE_FORMAT(cnd.createddate,'%Y%m'),'/',candidateguid,'/',Resume) as ResumePath from candidate cnd join mstorganization org on cnd.orgid = org.OrgId where Resume <> '' and Resume is not null order by cnd.modifieddate limit 100000"
        cursor.execute(sql)
        result = cursor.fetchall()
    # Iterate over the results and process each resume file
    for i in result:
        try:
            # Take the relative part of the resume path (from the first '/')
            # and turn it into a Windows UNC path
            resume_path = i['ResumePath']
            slash_index = resume_path.index("/")
            file1 = resume_path[slash_index:]
            string1 = r'\\xxx.xxx.x.xxx\Resumes\Cand_Res'
            # Python 2 style: encode() returns a byte string that still supports
            # replace() with str arguments
            file1e = file1.encode('ascii', 'ignore')
            urls = file1e.replace("/", "\\")
            file_full = string1 + urls
            time.sleep(1)
            # Extract the text from the document
            file_name1 = file_full
            if ".docx" in file_name1:
                # .docx files are zip archives; read the main document XML
                docx1 = zipfile.ZipFile(file_name1)
                content = docx1.read('word/document.xml').decode('utf-8')
                cleaned = re.sub('<(.|\n)*?>', '', content).encode('ascii', 'ignore')
                cleaned_word = cleaned.split()
                list1.append(cleaned_word)
            elif ".doc" in file_name1:
                # Legacy .doc files are read through the Word COM interface
                try:
                    doc = win32com.client.GetObject(file_name1)
                    text = doc.Range().Text
                    text1 = text.encode('ascii', 'ignore')
                    text_word = text1.split()
                    list1.append(text_word)
                except Exception:
                    print("DOC ISSUE")
            else:
                print("Not a Doc file")
        except Exception:
            print("OOPS1")
    # Return the collected token lists so callers can use them
    return list1
By explicitly setting the connection character set before the query is executed, the Chinese data in the statement and its results is parsed and handled correctly, and the mojibake problem no longer appears.
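As an alternative to issuing SET NAMES by hand, PyMySQL's connect() also accepts an init_command argument that runs a statement as soon as the connection is established; a minimal sketch reusing the connection parameters above:
connection = pymysql.connect(host='xxx.xxx.x.xxx',
                             user='abcd',
                             passwd='pwd1',
                             db='rep_db',
                             charset='utf8mb4',
                             init_command="SET NAMES utf8mb4",
                             cursorclass=pymysql.cursors.DictCursor)
With init_command, every cursor created from this connection starts from a session whose character set is already utf8mb4, so no separate SET NAMES call is needed before each query.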