Convert text file of dictionaries to csv Python

I have a text file composed of different dictionaries, and it looks like this:

{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas@foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"} {"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7@foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}

Each line is a dictionary, some dictionaries have more keys than others, and I would like to convert the text file to a csv file.

I have the following code:

import json
import csv
import ast

def json_to_csv(txt_file, csv_file):
    lista = []
    with open(txt_file, 'rb') as fin:
        lines = fin.readlines()
        for line in lines:
            dict_line = ast.literal_eval(line)
            lista.append(line)
    list_json = json.dumps(lista)
    read_json = json.loads(list_json)
    header = ["feed.accuracy", "feed.url", "source.geolocation.longitude", "event_description.text", "raw",
              "destination.geolocation.city", "source.ip", "classification.taxonomy",
              "time.observation", "destination.geolocation.latitude", "destination.ip", "source.asn",
              "feed.name", "source.geolocation.latitude", "time.source", "feed.provider",
              "destination.geolocation.longitude", "destination.geolocation.cc", "destination.asn",
              "source.abuse_contact", "source.geolocation.cc", "classification.type"]
    with open(csv_file, 'wb+') as f:
        dict_writer = csv.DictWriter(f, header)
        dict_writer.writeheader()
        dict_writer.writerows(read_json)

First I read the text file, then I convert its content into JSON, and then I try to write the converted data into the csv file; however, it returns the following error:

Traceback (most recent call last):
  File "<pyshell#38>", line 1, in <module>
    json_to_csv('ctt.txt','ctt.csv')
  File "C:/Users/Marisa/Documents/json_to_csv.py", line 26, in json_to_csv
    dict_writer.writerows(read_json)
  File "C:\Python27\lib\csv.py", line 157, in writerows
    rows.append(self._dict_to_list(rowdict))
  File "C:\Python27\lib\csv.py", line 148, in _dict_to_list
    + ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: u'{', u'"', u'f', u'e', u'e', u'd', u'.', u'a', u'c', u'c', u'u', u'r', u'a', u'c', u'y', u'"', u':', u' ', u'1', u'0', u'0', u'.', u'0', u',', u' ', u'"', u'c', u'l', u'a', u's', u's', u'i', u'f', u'i', u'c', u'a', u't', u'i', u'o', u'n', u'.', u't', u'a', u'x',...

Best answer

You're making it a little more complicated than it needs to be, and you're missing some of the fields in your own example data above. We can get rid of the ast dependency and the back & forth JSON processing, add in the missing fields, and the following will work with the sample data you've provided:

import json
import csv

def json_to_csv(txt_file, csv_file):
    lista = []
    with open(txt_file, 'r') as in_file:
        lines = in_file.readlines()
        for line in lines:
            try:
                dict_line = json.loads(line)
                lista.append(dict_line)
            except Exception as err:
                print(err)
    header = [
        "feed.accuracy", "feed.url", "source.geolocation.longitude", "event_description.text", "raw",
        "destination.geolocation.city", "source.ip", "classification.taxonomy",
        "time.observation", "destination.geolocation.latitude", "destination.ip", "source.asn",
        "feed.name", "source.geolocation.latitude", "time.source", "feed.provider",
        "destination.geolocation.longitude", "destination.geolocation.cc", "destination.asn",
        "source.abuse_contact", "source.geolocation.cc", "classification.type",
        "destination.fqdn", "source.fqdn", "source.geolocation.city", "source.url", "destination.url"
    ]
    with open(csv_file, 'w+') as out_file:
        dict_writer = csv.DictWriter(out_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(lista)
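
As a side note (not spelled out in the original answer), the ValueError full of single characters came from lista.append(line) in the original loop: the list ends up holding the raw JSON strings rather than the parsed dicts, and when Python 2's csv.DictWriter checks field names it iterates each such string character by character. A minimal sketch of the difference, using a made-up one-line sample:

import json

line = '{"feed.accuracy": 100.0, "classification.type": "malware"}'

wrong = line              # what lista.append(line) stored: a str
right = json.loads(line)  # what DictWriter needs: a dict

print(list(wrong)[:6])    # ['{', '"', 'f', 'e', 'e', 'd'] -- iterating a str yields its characters
print(list(right))        # the two field names -- iterating a dict yields its keys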

Note that if your real data has more fields that aren't included in your sample, you'll need to add those, too.
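
If hard-coding the header becomes a maintenance burden, one alternative (not part of the original answer; json_lines_to_csv is an assumed name) is to derive the fieldnames from the union of keys across all rows, so new fields are picked up automatically:

import csv
import json

def json_lines_to_csv(txt_file, csv_file):
    rows = []
    with open(txt_file, 'r') as in_file:
        for line in in_file:
            line = line.strip()
            if line:
                rows.append(json.loads(line))

    # Union of every key seen in any row, sorted for a stable column order.
    header = sorted({key for row in rows for key in row})

    with open(csv_file, 'w') as out_file:
        dict_writer = csv.DictWriter(out_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(rows)   # keys missing from a row are written as empty cells

On Python 3 you would normally also pass newline='' to the open() call for the output file so the csv module does not emit extra blank lines on Windows.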

Note too that if your input data were a proper JSON array like:

[{"destination.fqdn": "194-65-57-128.ctt.pt", "feed.provider": "MyFeed", "source.abuse_contact": "coisas@foo.com", "raw": "bWFsd2FyZSwyMTAuMjguNTYuMSxodHRwOi8vd3d3LmN0dC5wdCAsMTk0LTY1LTU3LTEyOC5jdHQucHQsY29pc2FzQGZvby5jb20sMTk0LjIzOS4xNjcuNSx3d3cudmVyeWJhZC5jb20gLHZlcnkudmVyeWJhZC5jb20sLCwsMjAxMC0wMi0xOFQwMDowMDowMCswMDowMA0K", "feed.name": "FileCollector", "destination.geolocation.latitude": 32.2109, "destination.geolocation.cc": "CN", "source.geolocation.longitude": 12.069, "event_description.text": "ctt", "source.ip": "194.239.167.5", "source.geolocation.city": "Frederikssund", "destination.geolocation.city": "Zhenjiang", "destination.url": "http://www.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.verybad.com", "source.fqdn": "very.verybad.com", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "feed.accuracy": 100.0, "time.observation": "2017-07-18T13:15:48+00:00", "destination.geolocation.longitude": 119.4551, "source.geolocation.latitude": 55.8396, "classification.type": "malware", "destination.ip": "210.28.56.1", "time.source": "2010-02-18T00:00:00+00:00", "source.geolocation.cc": "DK"}, {"destination.url": "http://www2.ctt.pt", "classification.taxonomy": "malicious code", "source.url": "http://www.telecom.pt", "feed.provider": "MyFeed", "time.observation": "2017-07-18T13:15:48+00:00", "destination.fqdn": "ctt-pt.mail.protection.outlook.com", "source.abuse_contact": "coisas7@foo.com", "source.geolocation.cc": "TN", "feed.url": "file://localhost/opt/intelmq/teste_ip_url_fqdn.csv", "raw": "YyZjLDI1MS4xNTQuNjUuOSxodHRwOi8vd3d3Mi5jdHQucHQsY3R0LXB0Lm1haWwucHJvdGVjdGlvbi5vdXRsb29rLmNvbSxjb2lzYXM3QGZvby5jb20sMTk3LjEzLjEwNS44LHd3dy50ZWxlY29tLnB0LCwsLCwyMDEwLTAyLTE4VDAwOjAwOjAwKzAwOjAwDQo=", "feed.name": "FileCollector", "classification.type": "c&c", "source.geolocation.latitude": 34.0, "source.geolocation.longitude": 9.0, "destination.ip": "251.154.65.9", "event_description.text": "ctt", "source.ip": "197.13.105.8", "time.source": "2010-02-18T00:00:00+00:00", "feed.accuracy": 100.0}]

the solution simplifies quite a bit more, with the whole initial with open block becoming just:

with open(txt_file, 'r') as in_file:
    lista = json.load(in_file)
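
For completeness, a sketch of how the whole function might look in that case (json_array_to_csv is an assumed name, and header is the same list used above):

import csv
import json

def json_array_to_csv(txt_file, csv_file, header):
    with open(txt_file, 'r') as in_file:
        lista = json.load(in_file)          # the file is one JSON array of dicts
    with open(csv_file, 'w+') as out_file:
        dict_writer = csv.DictWriter(out_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(lista)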
