生成tfrecords时出现UnicodeDecodeError错误。
生成tfrecords时出现UnicodeDecodeError错误。
我正在尝试使用TensorFlow API为目标检测模型的训练生成tfrecords和CSV标签,待检测对象的图像为PNG格式。我使用的是某个教程中的脚本,但遇到了以下错误:UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 65: invalid continuation byte,我不知道该如何解决。你们有什么想法吗?
以下是生成tfrecords的程序:
""" 用法: # 从tensorflow/models/目录下运行: # 创建训练数据: python preprocessing/csv_to_tfrecords.py --csv_input=data/train_labels.csv --output_path=data/train.record # 创建测试数据: python preprocessing/csv_to_tfrecords.py --csv_input=data/test_labels.csv --output_path=data/test.record """ from __future__ import division from __future__ import print_function from __future__ import absolute_import import os import io import pandas as pd import tensorflow as tf from PIL import Image from object_detection.utils import dataset_util from collections import namedtuple, OrderedDict flags = tf.compat.v1.app.flags flags.DEFINE_string('csv_input', '', 'CSV输入路径') flags.DEFINE_string('output_path', '', '输出TFRecord路径') flags.DEFINE_string('image_dir', '', '图像路径') FLAGS = flags.FLAGS # TO-DO:用标签映射替换此部分 def class_text_to_int(row_label): if row_label == 'capsule': return 1 else: None def split(df, group): data = namedtuple('data', ['filename', 'object']) gb = df.groupby(group) return [data(filename, gb.get_group(x)) for filename, x in zip(gb.groups.keys(), gb.groups)] def create_tf_example(group, path): with tf.io.gfile.GFile(os.path.join(path, '{}'.format(group.filename)), 'rb') as fid: encoded_png = fid.read() encoded_png_io = io.BytesIO(encoded_png) image = Image.open(encoded_png_io) width, height = image.size filename = group.filename.encode('utf8') image_format = b'png' xmins = [] xmaxs = [] ymins = [] ymaxs = [] classes_text = [] classes = [] for index, row in group.object.iterrows(): xmins.append(row['xmin'] / width) xmaxs.append(row['xmax'] / width) ymins.append(row['ymin'] / height) ymaxs.append(row['ymax'] / height) classes_text.append(row['class'].encode('utf8')) classes.append(class_text_to_int(row['class'])) tf_example = tf.train.Example(features=tf.train.Features(feature={ 'image/height': dataset_util.int64_feature(height), 'image/width': dataset_util.int64_feature(width), 'image/filename': dataset_util.bytes_feature(filename), 'image/source_id': dataset_util.bytes_feature(filename), 
'image/encoded': dataset_util.bytes_feature(encoded_png), 'image/format': dataset_util.bytes_feature(image_format), 'image/object/bbox/xmin': dataset_util.float_list_feature(xmins), 'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs), 'image/object/bbox/ymin': dataset_util.float_list_feature(ymins), 'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs), 'image/object/class/text': dataset_util.bytes_list_feature(classes_text), 'image/object/class/label': dataset_util.int64_list_feature(classes), })) return tf_example def main(_): writer = tf.compat.v1.python_io.TFRecordWriter(FLAGS.output_path) path = os.path.join(FLAGS.image_dir) examples = pd.read_csv(FLAGS.csv_input) grouped = split(examples, 'filename') for group in grouped: tf_example = create_tf_example(group, path) writer.write(tf_example.SerializeToString()) writer.close() output_path = os.path.join(os.getcwd(), FLAGS.output_path) print('成功创建TFRecords:{}'.format(output_path)) if __name__ == '__main__': tf.compat.v1.app.run()
尝试生成tfrecords后,我得到了以下回溯信息:
python preprocessing/csv_to_tfrecords.py --csv_input=data/train_labels.csv --output_path=data/train.record
2021-05-31 21:34:20.376813: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-05-31 21:34:20.377348: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Traceback (most recent call last):
  File "preprocessing/csv_to_tfrecords.py", line 100, in <module>
    tf.compat.v1.app.run()
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\platform\app.py", line 40, in run
    _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\absl\app.py", line 303, in run
    _run_main(main, args)
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\absl\app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "preprocessing/csv_to_tfrecords.py", line 91, in main
    tf_example = create_tf_example(group, path)
  File "preprocessing/csv_to_tfrecords.py", line 46, in create_tf_example
    encoded_png = fid.read()
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 117, in read
    self._preread_check()
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\lib\io\file_io.py", line 80, in _preread_check
    compat.path_to_str(self.__name), 1024 * 512)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 65: invalid continuation byte
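对于生成tfrecords时出现的UnicodeDecodeError,一种可能的原因是在执行pd.read_csv(FLAGS.csv_input)这一行时出现了编码问题,此时的解决方法是尝试使用不同的编码方式读取csv文件。不过需要注意:上面的回溯信息显示,异常是在create_tf_example中的fid.read()(读取图像文件)处抛出的,而不是在pd.read_csv处;并且运行命令时没有传入--image_dir参数。因此还应检查图像路径(--image_dir以及CSV中的filename列)是否正确、图像文件是否存在。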
UnicodeDecodeError是一个常见的Python编码错误,它表示在解码Unicode字符串时发生了错误。在这种情况下,可能是因为使用了错误的编码方式读取了csv文件,导致无法正确解码文件中的Unicode字符。
为了解决这个问题,可以尝试使用不同的编码方式进行读取。在提供的链接中(answer)可以找到一些常用的编码方式。可以尝试使用这些编码方式之一,看看是否可以成功读取csv文件。
下面是一个示例代码,演示了如何使用不同的编码方式读取csv文件:
import pandas as pd

# Try UTF-8 first and fall back to Latin-1 when the file is not valid UTF-8.
try:
    df = pd.read_csv(FLAGS.csv_input, encoding='utf-8')
except UnicodeDecodeError:
    # 'latin1' and 'iso-8859-1' name the same codec, and it maps every byte
    # value to a character, so decoding with it cannot raise
    # UnicodeDecodeError; a further fallback would be unreachable.
    df = pd.read_csv(FLAGS.csv_input, encoding='latin1')

# Continue processing df...
通过尝试不同的编码方式,可以找到正确的编码方式并成功读取csv文件,从而解决UnicodeDecodeError while generating tfrecords的问题。