多线程多个PDF文件(Multi threading multiple pdf files)
所以我试图让多个PDF文件逐一通过一个提取文本的函数，将提取结果与一个静态字典进行比较，然后把相应的关联数据添加到MySQL的索引表中。我研究了多线程，但不确定它能否满足我的需求。
这是for循环,我将浏览所有PDF文件
for(String temp: files){ //addToDict(temp,dictonary,conn); //new Scraper(temp,dictonary,conn).run(); Scraper obj=new Scraper(temp,dictonary,conn); Thread T1 =new Thread(obj); T1.start(); //System.out.println((ammountOfFiles--)+" files left"); }这里是我创建的实现可运行的Scraper类
public class Scraper implements Runnable { private String filePath; private HashMap<String,Integer> map; private Connection conn; public Scraper(String file_path,HashMap<String,Integer> dict,Connection connection) { // store parameter for later user filePath =file_path; map = dict; conn = connection; } @Override public void run() { //cut file path so it starts from the data folder int cutPos = filePath.indexOf("Data"); String cutPath = filePath.substring(cutPos); cutPath = cutPath.replaceAll("\\\\", "|"); System.out.println(cutPath+" being scrapped"); // Queries String addSentanceQuery ="INSERT INTO sentance(sentance_ID,sentance_Value) VALUES(Default,?)"; String addContextQuery ="INSERT INTO context(context_ID,word_ID,sentance_ID,pdf_path) VALUES(Default,?,?,?)"; // Prepared Statementes // RESULT SETS ResultSet sentanceKeyRS=null; BodyContentHandler handler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); FileInputStream inputstream = null; try { inputstream = new FileInputStream(new File(filePath)); } catch (FileNotFoundException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } ParseContext pcontext = new ParseContext(); //parsing the document using PDF parser PDFParser pdfparser = new PDFParser(); try { pdfparser.parse(inputstream, handler, metadata, pcontext); } catch (IOException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } catch (SAXException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } catch (TikaException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } //getting the content of the document String fileText = handler.toString(); fileText = fileText.toLowerCase(); //spilt text by new line String sentances [] = fileText.split("\\n"); for(String x : sentances){ x = x.trim(); if(x.isEmpty() || x.matches("\\t+") || x.matches("\\n+") || x.matches("")){ }else{ int sentanceID = 0; //add sentance to db and get the id try 
(PreparedStatement addSentancePrepare = conn.prepareStatement(addSentanceQuery,Statement.RETURN_GENERATED_KEYS)) { addSentancePrepare.setString(1, x); addSentancePrepare.executeUpdate(); sentanceKeyRS = addSentancePrepare.getGeneratedKeys(); while (sentanceKeyRS.next()) { sentanceID = sentanceKeyRS.getInt(1); } addSentancePrepare.close(); sentanceKeyRS.close(); } catch (SQLException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } String words [] = x.split(" "); for(String y : words){ y = y.trim(); if(y.matches("\\s+") || y.matches("")){ }else if(map.containsKey(y)){ //get ID and put in middle table try (PreparedStatement addContextPrepare = conn.prepareStatement(addContextQuery)) { addContextPrepare.setInt(1, map.get(y)); addContextPrepare.setInt(2, sentanceID); addContextPrepare.setString(3, cutPath); addContextPrepare.executeUpdate(); addContextPrepare.close(); } catch (SQLException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } } } } } try { inputstream.close(); } catch (IOException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } } }我正确地处理这个问题吗? 我从来没有使用过多线程,但它似乎会加快我的程序。
So I'm trying to run multiple PDF files through a function that scrapes the text, compares it to a static dictionary, then adds its relational data to an index table in MySQL. I looked into multi-threading but am not sure if this would achieve what I need.
Here is the for loop where I am going through all the PDF files
for(String temp: files){ //addToDict(temp,dictonary,conn); //new Scraper(temp,dictonary,conn).run(); Scraper obj=new Scraper(temp,dictonary,conn); Thread T1 =new Thread(obj); T1.start(); //System.out.println((ammountOfFiles--)+" files left"); }And here is the Scraper class I created that implements runnable
public class Scraper implements Runnable { private String filePath; private HashMap<String,Integer> map; private Connection conn; public Scraper(String file_path,HashMap<String,Integer> dict,Connection connection) { // store parameter for later user filePath =file_path; map = dict; conn = connection; } @Override public void run() { //cut file path so it starts from the data folder int cutPos = filePath.indexOf("Data"); String cutPath = filePath.substring(cutPos); cutPath = cutPath.replaceAll("\\\\", "|"); System.out.println(cutPath+" being scrapped"); // Queries String addSentanceQuery ="INSERT INTO sentance(sentance_ID,sentance_Value) VALUES(Default,?)"; String addContextQuery ="INSERT INTO context(context_ID,word_ID,sentance_ID,pdf_path) VALUES(Default,?,?,?)"; // Prepared Statementes // RESULT SETS ResultSet sentanceKeyRS=null; BodyContentHandler handler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); FileInputStream inputstream = null; try { inputstream = new FileInputStream(new File(filePath)); } catch (FileNotFoundException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } ParseContext pcontext = new ParseContext(); //parsing the document using PDF parser PDFParser pdfparser = new PDFParser(); try { pdfparser.parse(inputstream, handler, metadata, pcontext); } catch (IOException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } catch (SAXException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } catch (TikaException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } //getting the content of the document String fileText = handler.toString(); fileText = fileText.toLowerCase(); //spilt text by new line String sentances [] = fileText.split("\\n"); for(String x : sentances){ x = x.trim(); if(x.isEmpty() || x.matches("\\t+") || x.matches("\\n+") || x.matches("")){ }else{ int sentanceID = 0; //add sentance to db and get the id try 
(PreparedStatement addSentancePrepare = conn.prepareStatement(addSentanceQuery,Statement.RETURN_GENERATED_KEYS)) { addSentancePrepare.setString(1, x); addSentancePrepare.executeUpdate(); sentanceKeyRS = addSentancePrepare.getGeneratedKeys(); while (sentanceKeyRS.next()) { sentanceID = sentanceKeyRS.getInt(1); } addSentancePrepare.close(); sentanceKeyRS.close(); } catch (SQLException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } String words [] = x.split(" "); for(String y : words){ y = y.trim(); if(y.matches("\\s+") || y.matches("")){ }else if(map.containsKey(y)){ //get ID and put in middle table try (PreparedStatement addContextPrepare = conn.prepareStatement(addContextQuery)) { addContextPrepare.setInt(1, map.get(y)); addContextPrepare.setInt(2, sentanceID); addContextPrepare.setString(3, cutPath); addContextPrepare.executeUpdate(); addContextPrepare.close(); } catch (SQLException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } } } } } try { inputstream.close(); } catch (IOException ex) { Logger.getLogger(Scraper.class.getName()).log(Level.SEVERE, null, ex); } } }Am I going about this correctly? I have never used multi threading but it seems like it would speed up my program.
更多推荐
发布评论