Python sys.argv [1]索引超出范围(Python sys.argv[1] index out of range)

我有以下代码从一堆PDF文件中提取一些数据。它适用于少数文件然后它给我一个超出范围错误的索引。

__author__ = 'thavan' import os import sys import convertor def _get_file_list(root): """ Get all the pdf files in a given directory. :param root: :return: """ file_list = [] for root, dir, files in os.walk(root): if len(files) > 0: for f in files: file_path = os.path.join(root, f) if file_path.endswith('.pdf'): file_list.append(file_path) return file_list def _match_key(key, match_list): for match in match_list: if match == key: return True return False class SedaScraper(object): process_only = [] # Add here any PDF file with full path if you want to process only these files. def __init__(self): """ Update this list whenever need to fetch a new value. :return: """ self.total_spend_key = ['Total Spend', 'Total spend'] self.total_spend_all_media_key = ['Total spend All Media'] self.outlet_per_all_media_key = ['Press % All Media', 'Internet % All Media', 'Outdoor % All Media', 'TV % All Media', 'Cinema % All Media'] self.no_of_new_create_key = ['No of New Banners', 'No. of New Creatives'] def _get_csv_values(self, pdf_file): """Extracts values from a given PDF file. """ pdf_text = convertor.convert(pdf_file).get_text() lines = pdf_text.split('\n') data = [] sub_data = [] for line in lines: if line.strip() != '': sub_data.append(line.strip()) else: data.append(sub_data) sub_data = [] outlet = data[0][0] company_name = data[1][0] date = data[2][0] start_date = date.split(' to ')[0] end_date = date.split(' to ')[1] for x in range(3, len(data)): try: if _match_key(data[x][0], self.no_of_new_create_key): metric_data = dict(zip(data[x], data[x+1])) break except IndexError: print "Some required text not found. Please check following data... {}".format(data) sys.exit(1) total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives = self._parse_metric(metric_data) print company_name, outlet, start_date, end_date, total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives # change below CSV separator as required. return '|'.join((company_name, outlet, start_date, end_date, total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives)) def _parse_metric(self, metric_data): total_spend = None total_spend_all_media = None outlet_per_all_media = None no_of_new_creatives = None for key, value in metric_data.items(): if _match_key(key, self.total_spend_key): total_spend = value elif _match_key(key, self.total_spend_all_media_key): total_spend_all_media = value elif _match_key(key, self.outlet_per_all_media_key): outlet_per_all_media = value elif _match_key(key, self.no_of_new_create_key): no_of_new_creatives = value return total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives def process(self, root): """Iteratively goes through every PDF file. :param root: :return: """ pdf_list = _get_file_list(root) out_file = open(os.path.join(root, 'output.csv'), 'w') if self.process_only: pdf_list = self.process_only for pdf in pdf_list: print "Processing", pdf csv_line = self._get_csv_values(pdf) out_file.write(csv_line + '\n') print "Output file: {}".format(out_file.name) if __name__ == '__main__': if len(sys.argv) == 1: print "Usage: pdf_scraper.py <path>" sys.exit(1) SedaScraper().process(sys.argv[1])

这是我得到的错误：

C:\Users\soz\Documents\Python\seda_pdf\src>python pdf_scraper.py C:\Users\soz\Do cuments\Python\seda_pdf\2 Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\01.pdf Aberdeen Asset Management Internet 01 January 2009 31 January 2009 ┬ú5,505 ┬ú166 ,384 3.31% 5 Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\02.pdf Aberdeen Asset Management Internet 01 February 2009 28 February 2009 ┬ú5,906 ┬ú2 26,575 2.61% 5 Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\03.pdf Traceback (most recent call last): File "pdf_scraper.py", line 117, in <module> SedaScraper().process(sys.argv[1]) File "pdf_scraper.py", line 109, in process csv_line = self._get_csv_values(pdf) File "pdf_scraper.py", line 65, in _get_csv_values end_date = date.split(' to ')[1] IndexError: list index out of range

我无法弄清楚什么是错的，因为它实际上适用于我拥有的一些PDF文件。我也检查了我的文件，它们没有任何问题。

我的Python知识相当有限，所以如果你的答案可能是dummyproof，我会很感激。

I have the following code to extract some data from a bunch of PDF files. And it works for few files then it gives me an index out of range error.

__author__ = 'thavan' import os import sys import convertor def _get_file_list(root): """ Get all the pdf files in a given directory. :param root: :return: """ file_list = [] for root, dir, files in os.walk(root): if len(files) > 0: for f in files: file_path = os.path.join(root, f) if file_path.endswith('.pdf'): file_list.append(file_path) return file_list def _match_key(key, match_list): for match in match_list: if match == key: return True return False class SedaScraper(object): process_only = [] # Add here any PDF file with full path if you want to process only these files. def __init__(self): """ Update this list whenever need to fetch a new value. :return: """ self.total_spend_key = ['Total Spend', 'Total spend'] self.total_spend_all_media_key = ['Total spend All Media'] self.outlet_per_all_media_key = ['Press % All Media', 'Internet % All Media', 'Outdoor % All Media', 'TV % All Media', 'Cinema % All Media'] self.no_of_new_create_key = ['No of New Banners', 'No. of New Creatives'] def _get_csv_values(self, pdf_file): """Extracts values from a given PDF file. """ pdf_text = convertor.convert(pdf_file).get_text() lines = pdf_text.split('\n') data = [] sub_data = [] for line in lines: if line.strip() != '': sub_data.append(line.strip()) else: data.append(sub_data) sub_data = [] outlet = data[0][0] company_name = data[1][0] date = data[2][0] start_date = date.split(' to ')[0] end_date = date.split(' to ')[1] for x in range(3, len(data)): try: if _match_key(data[x][0], self.no_of_new_create_key): metric_data = dict(zip(data[x], data[x+1])) break except IndexError: print "Some required text not found. Please check following data... {}".format(data) sys.exit(1) total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives = self._parse_metric(metric_data) print company_name, outlet, start_date, end_date, total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives # change below CSV separator as required. return '|'.join((company_name, outlet, start_date, end_date, total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives)) def _parse_metric(self, metric_data): total_spend = None total_spend_all_media = None outlet_per_all_media = None no_of_new_creatives = None for key, value in metric_data.items(): if _match_key(key, self.total_spend_key): total_spend = value elif _match_key(key, self.total_spend_all_media_key): total_spend_all_media = value elif _match_key(key, self.outlet_per_all_media_key): outlet_per_all_media = value elif _match_key(key, self.no_of_new_create_key): no_of_new_creatives = value return total_spend, total_spend_all_media, outlet_per_all_media, no_of_new_creatives def process(self, root): """Iteratively goes through every PDF file. :param root: :return: """ pdf_list = _get_file_list(root) out_file = open(os.path.join(root, 'output.csv'), 'w') if self.process_only: pdf_list = self.process_only for pdf in pdf_list: print "Processing", pdf csv_line = self._get_csv_values(pdf) out_file.write(csv_line + '\n') print "Output file: {}".format(out_file.name) if __name__ == '__main__': if len(sys.argv) == 1: print "Usage: pdf_scraper.py <path>" sys.exit(1) SedaScraper().process(sys.argv[1])

And here is the error I get:

C:\Users\soz\Documents\Python\seda_pdf\src>python pdf_scraper.py C:\Users\soz\Do cuments\Python\seda_pdf\2 Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\01.pdf Aberdeen Asset Management Internet 01 January 2009 31 January 2009 ┬ú5,505 ┬ú166 ,384 3.31% 5 Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\02.pdf Aberdeen Asset Management Internet 01 February 2009 28 February 2009 ┬ú5,906 ┬ú2 26,575 2.61% 5 Processing C:\Users\soz\Documents\Python\seda_pdf\2\2009\Internet\03.pdf Traceback (most recent call last): File "pdf_scraper.py", line 117, in <module> SedaScraper().process(sys.argv[1]) File "pdf_scraper.py", line 109, in process csv_line = self._get_csv_values(pdf) File "pdf_scraper.py", line 65, in _get_csv_values end_date = date.split(' to ')[1] IndexError: list index out of range

I can't figure out what's wrong because it's actually working for some PDF files I have. Also I checked the files I have, there is nothing wrong with them.

My Python knowledge is rather limited so I'd appreciate if your answer could be dummyproof.