我有一个Excel文档,该文档大约有250000行,导入耗时极长.我已经对该导入做过许多变体,但有几项要求: - 需要验证每个单元格中的数据 - 必须检查数据库中是否存在重复项 - 如果存在重复项,则更新该条目 - 如果不存在任何条目,则插入一个新条目
我已尽可能多地使用并行化,但是我确信必须有某种方法才能使此导入运行得更快.任何帮助或想法将不胜感激.
请注意,数据库位于LAN上,是的,我知道我还没有使用参数化的sql命令(还).
// NOTE(review): the sa connection string was hard-coded inline in the SQL code;
// kept as one constant to preserve behavior — TODO: move to configuration/secrets.
private const string ConnectionString =
    "user id=sa;password=D3vAdm!n@;server=196.30.181.143;database=TheUnlimitedUSSD;MultipleActiveResultSets=True";

/// <summary>
/// Imports the "User" sheet of the spreadsheet, updating existing users and
/// inserting new ones. Returns an empty string on success, otherwise the
/// message of the first exception encountered.
/// </summary>
public string BulkUserInsertAndUpdate()
{
    // Stopwatch is the correct tool for elapsed-time measurement;
    // DateTime.Now subtraction is affected by system clock adjustments.
    var stopwatch = Stopwatch.StartNew();
    try
    {
        ProcessInParallel();
        Debug.WriteLine("Time taken: " + stopwatch.Elapsed);
    }
    catch (Exception ex)
    {
        return ex.Message;
    }
    return "";
}

/// <summary>
/// Streams the rows of the "User" worksheet (skipping the header row) without
/// loading the entire document into memory.
/// </summary>
private IEnumerable<Row> ReadDocument()
{
    using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(_fileName, false))
    {
        WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
        Sheet ss = workbookPart.Workbook.Descendants<Sheet>().SingleOrDefault(s => s.Name == "User");
        if (ss == null)
            throw new Exception("There was a problem trying to import the file. Please ensure that the Sheet's name is: User");

        WorksheetPart worksheetPart = (WorksheetPart)workbookPart.GetPartById(ss.Id);
        StringTablePart = workbookPart.SharedStringTablePart;

        // OpenXmlReader is IDisposable — the original never disposed it.
        using (OpenXmlReader reader = OpenXmlReader.Create(worksheetPart))
        {
            while (reader.Read())
            {
                if (reader.ElementType != typeof(Row))
                    continue;

                do
                {
                    if (!reader.HasAttributes)
                        continue;

                    var rowNum = int.Parse(reader.Attributes.First(a => a.LocalName == "r").Value);
                    if (rowNum == 1)
                        continue; // skip the header row

                    yield return (Row)reader.LoadCurrentElement();
                } while (reader.ReadNextSibling()); // advance to the next row

                break; // all rows visited — no need to keep reading the worksheet
            }
        }
    }
}

/// <summary>
/// Validates each spreadsheet row and upserts it into dbo.[User].
/// Row-level failures are collected and re-thrown as an AggregateException.
/// </summary>
private void ProcessInParallel()
{
    // ConcurrentQueue enables safe enqueueing from multiple threads.
    var exceptions = new ConcurrentQueue<Exception>();

    Parallel.ForEach(ReadDocument(), (row, loopState) =>
    {
        List<Cell> cells = row.Descendants<Cell>().ToList();
        if (string.IsNullOrEmpty(GetCellValue(cells[0], StringTablePart)))
            return;

        // validation code goes here....
        // NOTE(review): mobileNumber, status, policyNumber and newMenuIndicator
        // are produced by the validation step — confirm they are always assigned.

        try
        {
            using (var connection = new SqlConnection(ConnectionString))
            {
                connection.Open();

                int userCount;
                // Parameterized to prevent SQL injection — the original
                // concatenated cell values straight into the SQL text.
                using (var exists = new SqlCommand(
                    "SELECT COUNT(*) FROM dbo.[User] WHERE MobileNumber = @MobileNumber", connection))
                {
                    exists.Parameters.AddWithValue("@MobileNumber", mobileNumber);
                    userCount = (int)exists.ExecuteScalar();
                }

                if (userCount > 0)
                {
                    // Table name normalized to dbo.[User]; the original UPDATE
                    // used [user] while SELECT/INSERT used dbo.[User].
                    using (var update = new SqlCommand(
                        "UPDATE dbo.[User] SET NewMenu = @NewMenu, PolicyNumber = @PolicyNumber, " +
                        "Status = @Status WHERE MobileNumber = @MobileNumber", connection))
                    {
                        update.Parameters.AddWithValue("@NewMenu", newMenuIndicator);
                        update.Parameters.AddWithValue("@PolicyNumber", policyNumber);
                        update.Parameters.AddWithValue("@Status", status);
                        update.Parameters.AddWithValue("@MobileNumber", mobileNumber);
                        // ExecuteNonQuery is the correct call for UPDATE/INSERT;
                        // the original used ExecuteScalar.
                        update.ExecuteNonQuery();
                    }
                    Debug.WriteLine("Update cmd");
                }
                else
                {
                    using (var insert = new SqlCommand(
                        "INSERT INTO dbo.[User] (MobileNumber, Status, PolicyNumber, NewMenu) " +
                        "VALUES (@MobileNumber, @Status, @PolicyNumber, @NewMenu)", connection))
                    {
                        insert.Parameters.AddWithValue("@MobileNumber", mobileNumber);
                        insert.Parameters.AddWithValue("@Status", status);
                        insert.Parameters.AddWithValue("@PolicyNumber", policyNumber);
                        insert.Parameters.AddWithValue("@NewMenu", newMenuIndicator);
                        insert.ExecuteNonQuery();
                    }
                    Debug.WriteLine("Insert cmd");
                }
            }
        }
        catch (Exception ex)
        {
            exceptions.Enqueue(ex);
            Debug.WriteLine(ex.Message);
            loopState.Break();
        }
    });

    // Surface all collected failures after the loop completes.
    if (exceptions.Count > 0)
        throw new AggregateException(exceptions);
}
我建议您先不做任何验证,将数据批量导入到一个中间表,然后再通过SQL进行所有验证.这样,电子表格的数据就会具有类似SQL表的结构.这正是我在从Excel和CSV进行300万行以上的工业级导入时所采用的做法,效果非常好.
I have an Excel document that has about 250000 rows which takes forever to import. I have done many variations of this import, however there are a few requirements: - Need to validate the data in each cell - Must check if a duplicate exists in the database - If a duplicate exists, update the entry - If no entry exists, insert a new one
I have used parallelization as much as possible however I am sure that there must be some way to get this import to run much faster. Any assistance or ideas would be greatly appreciated.
Note that the database is on a LAN, and yes I know I haven't used parameterized sql commands (yet).
// NOTE(review): the sa connection string was hard-coded inline in the SQL code;
// kept as one constant to preserve behavior — TODO: move to configuration/secrets.
private const string ConnectionString =
    "user id=sa;password=D3vAdm!n@;server=196.30.181.143;database=TheUnlimitedUSSD;MultipleActiveResultSets=True";

/// <summary>
/// Imports the "User" sheet of the spreadsheet, updating existing users and
/// inserting new ones. Returns an empty string on success, otherwise the
/// message of the first exception encountered.
/// </summary>
public string BulkUserInsertAndUpdate()
{
    // Stopwatch is the correct tool for elapsed-time measurement;
    // DateTime.Now subtraction is affected by system clock adjustments.
    var stopwatch = Stopwatch.StartNew();
    try
    {
        ProcessInParallel();
        Debug.WriteLine("Time taken: " + stopwatch.Elapsed);
    }
    catch (Exception ex)
    {
        return ex.Message;
    }
    return "";
}

/// <summary>
/// Streams the rows of the "User" worksheet (skipping the header row) without
/// loading the entire document into memory.
/// </summary>
private IEnumerable<Row> ReadDocument()
{
    using (SpreadsheetDocument spreadSheetDocument = SpreadsheetDocument.Open(_fileName, false))
    {
        WorkbookPart workbookPart = spreadSheetDocument.WorkbookPart;
        Sheet ss = workbookPart.Workbook.Descendants<Sheet>().SingleOrDefault(s => s.Name == "User");
        if (ss == null)
            throw new Exception("There was a problem trying to import the file. Please ensure that the Sheet's name is: User");

        WorksheetPart worksheetPart = (WorksheetPart)workbookPart.GetPartById(ss.Id);
        StringTablePart = workbookPart.SharedStringTablePart;

        // OpenXmlReader is IDisposable — the original never disposed it.
        using (OpenXmlReader reader = OpenXmlReader.Create(worksheetPart))
        {
            while (reader.Read())
            {
                if (reader.ElementType != typeof(Row))
                    continue;

                do
                {
                    if (!reader.HasAttributes)
                        continue;

                    var rowNum = int.Parse(reader.Attributes.First(a => a.LocalName == "r").Value);
                    if (rowNum == 1)
                        continue; // skip the header row

                    yield return (Row)reader.LoadCurrentElement();
                } while (reader.ReadNextSibling()); // advance to the next row

                break; // all rows visited — no need to keep reading the worksheet
            }
        }
    }
}

/// <summary>
/// Validates each spreadsheet row and upserts it into dbo.[User].
/// Row-level failures are collected and re-thrown as an AggregateException.
/// </summary>
private void ProcessInParallel()
{
    // ConcurrentQueue enables safe enqueueing from multiple threads.
    var exceptions = new ConcurrentQueue<Exception>();

    Parallel.ForEach(ReadDocument(), (row, loopState) =>
    {
        List<Cell> cells = row.Descendants<Cell>().ToList();
        if (string.IsNullOrEmpty(GetCellValue(cells[0], StringTablePart)))
            return;

        // validation code goes here....
        // NOTE(review): mobileNumber, status, policyNumber and newMenuIndicator
        // are produced by the validation step — confirm they are always assigned.

        try
        {
            using (var connection = new SqlConnection(ConnectionString))
            {
                connection.Open();

                int userCount;
                // Parameterized to prevent SQL injection — the original
                // concatenated cell values straight into the SQL text.
                using (var exists = new SqlCommand(
                    "SELECT COUNT(*) FROM dbo.[User] WHERE MobileNumber = @MobileNumber", connection))
                {
                    exists.Parameters.AddWithValue("@MobileNumber", mobileNumber);
                    userCount = (int)exists.ExecuteScalar();
                }

                if (userCount > 0)
                {
                    // Table name normalized to dbo.[User]; the original UPDATE
                    // used [user] while SELECT/INSERT used dbo.[User].
                    using (var update = new SqlCommand(
                        "UPDATE dbo.[User] SET NewMenu = @NewMenu, PolicyNumber = @PolicyNumber, " +
                        "Status = @Status WHERE MobileNumber = @MobileNumber", connection))
                    {
                        update.Parameters.AddWithValue("@NewMenu", newMenuIndicator);
                        update.Parameters.AddWithValue("@PolicyNumber", policyNumber);
                        update.Parameters.AddWithValue("@Status", status);
                        update.Parameters.AddWithValue("@MobileNumber", mobileNumber);
                        // ExecuteNonQuery is the correct call for UPDATE/INSERT;
                        // the original used ExecuteScalar.
                        update.ExecuteNonQuery();
                    }
                    Debug.WriteLine("Update cmd");
                }
                else
                {
                    using (var insert = new SqlCommand(
                        "INSERT INTO dbo.[User] (MobileNumber, Status, PolicyNumber, NewMenu) " +
                        "VALUES (@MobileNumber, @Status, @PolicyNumber, @NewMenu)", connection))
                    {
                        insert.Parameters.AddWithValue("@MobileNumber", mobileNumber);
                        insert.Parameters.AddWithValue("@Status", status);
                        insert.Parameters.AddWithValue("@PolicyNumber", policyNumber);
                        insert.Parameters.AddWithValue("@NewMenu", newMenuIndicator);
                        insert.ExecuteNonQuery();
                    }
                    Debug.WriteLine("Insert cmd");
                }
            }
        }
        catch (Exception ex)
        {
            exceptions.Enqueue(ex);
            Debug.WriteLine(ex.Message);
            loopState.Break();
        }
    });

    // Surface all collected failures after the loop completes.
    if (exceptions.Count > 0)
        throw new AggregateException(exceptions);
}
解决方案
I would have suggested that you do a bulk import WITHOUT any validation to an intermediary table, and only then do all the validation via SQL. Your spreadsheet's data will now be in a similar structure as a SQL table. This is what I have done with industrial-strength imports of 3 million rows + from Excel and CSV with great success.
更多推荐
任何加快此excel导入速度的方法吗?
发布评论