Recently, I’ve been learning about the Statistical language R. To allow me to do some testing against it, I wanted to get a decent sized dataset into SQL Server.
Having found a good dataset, the Airlines data from the 2009 Data Expo, I downloaded it, and proceeded to try and load it into SQL Server. I failed almost straightaway since the downloaded files were UTF-8 in format, and SQL Server cannot directly load them in using Bulk Copy or BCP.
So, I looked around for another option. I could have used SSIS, however, there seemed to be an issue with the Data Conversion module, where it wasn’t saving the changes having converted the fields from UTF-8 to ASCII.
My next choice was to have a look to see if C# would help. It did. The code below allowed me to take the source file (specified in _filePath) and load it into a database table (specified in _connString).
The code works by creating a DataTable into which the data from the CSV file is loaded. The data is then fed into SQL Server in batches of 100,0000 (parameterised using _batchSize).
Here is the code, it’s pretty well performing on my machine, running at 20-25k rows a second.
using Microsoft.VisualBasic.FileIO; using System.Data; using System.Data.SqlClient; public class LoadUTF8CSVIntoSQL { protected const string _filePath = @"C:\Inbox\DataSets\Flights\extract\1987.csv"; protected const string _connString = @"Server=localhost;Database=Datasets;Trusted_Connection=True;"; protected const int _batchSize = 100000; protected static void LoadDataTable(SqlBulkCopy sqlBulkCopy, SqlConnection sqlConnection, DataTable dataTable) { try { sqlBulkCopy.WriteToServer(dataTable); dataTable.Rows.Clear(); System.Console.WriteLine("{0}\t- Batch Written",System.DateTime.Now.ToString()); } catch (System.Exception ex) { System.Console.WriteLine("ERROR : "+ex.Message); } } public static void LoadCsvDataIntoSqlServer(string sFilePath) { try { // This should be the full path var fileName = ""; if (sFilePath.Length == 0) { fileName = _filePath; } else { fileName = sFilePath; } var createdCount = 0; using (var textFieldParser = new TextFieldParser(fileName)) { textFieldParser.TextFieldType = FieldType.Delimited; textFieldParser.Delimiters = new[] { "," }; textFieldParser.HasFieldsEnclosedInQuotes = false; var connectionString = _connString; var dataTable = new DataTable("airlines"); // Specify the columns in the Data Table dataTable.Columns.Add("Yr"); dataTable.Columns.Add("Mth"); dataTable.Columns.Add("DayofMonth"); dataTable.Columns.Add("DayOfWeek"); dataTable.Columns.Add("DepTime"); dataTable.Columns.Add("CRSDepTime"); dataTable.Columns.Add("ArrTime"); dataTable.Columns.Add("CRSArrTime"); dataTable.Columns.Add("UniqueCarrier"); dataTable.Columns.Add("FlightNum"); dataTable.Columns.Add("TailNum"); dataTable.Columns.Add("ActualElapsedTime"); dataTable.Columns.Add("CRSElapsedTime"); dataTable.Columns.Add("AirTime"); dataTable.Columns.Add("ArrDelay"); dataTable.Columns.Add("DepDelay"); dataTable.Columns.Add("Origin"); dataTable.Columns.Add("Dest"); dataTable.Columns.Add("Distance"); dataTable.Columns.Add("TaxiIn"); dataTable.Columns.Add("TaxiOut"); dataTable.Columns.Add("Cancelled"); dataTable.Columns.Add("CancellationCode"); dataTable.Columns.Add("Diverted"); dataTable.Columns.Add("CarrierDelay"); dataTable.Columns.Add("WeatherDelay"); dataTable.Columns.Add("NASDelay"); dataTable.Columns.Add("SecurityDelay"); dataTable.Columns.Add("LateAircraftDelay"); using (var sqlConnection = new SqlConnection(connectionString)) { sqlConnection.Open(); // Initialise Bulk Copy Object var sqlBulkCopy = new SqlBulkCopy(sqlConnection) { DestinationTableName = "airlines" }; // Define column mappings between Data Table and Target Table sqlBulkCopy.ColumnMappings.Add("Yr", "Yr"); sqlBulkCopy.ColumnMappings.Add("Mth", "Mth"); sqlBulkCopy.ColumnMappings.Add("DayofMonth", "DayofMonth"); sqlBulkCopy.ColumnMappings.Add("DayOfWeek", "DayOfWeek"); sqlBulkCopy.ColumnMappings.Add("DepTime", "DepTime"); sqlBulkCopy.ColumnMappings.Add("CRSDepTime", "CRSDepTime"); sqlBulkCopy.ColumnMappings.Add("ArrTime", "ArrTime"); sqlBulkCopy.ColumnMappings.Add("CRSArrTime", "CRSArrTime"); sqlBulkCopy.ColumnMappings.Add("UniqueCarrier", "UniqueCarrier"); sqlBulkCopy.ColumnMappings.Add("FlightNum", "FlightNum"); sqlBulkCopy.ColumnMappings.Add("TailNum", "TailNum"); sqlBulkCopy.ColumnMappings.Add("ActualElapsedTime", "ActualElapsedTime"); sqlBulkCopy.ColumnMappings.Add("CRSElapsedTime", "CRSElapsedTime"); sqlBulkCopy.ColumnMappings.Add("AirTime", "AirTime"); sqlBulkCopy.ColumnMappings.Add("ArrDelay", "ArrDelay"); sqlBulkCopy.ColumnMappings.Add("DepDelay", "DepDelay"); sqlBulkCopy.ColumnMappings.Add("Origin", "Origin"); sqlBulkCopy.ColumnMappings.Add("Dest", "Dest"); sqlBulkCopy.ColumnMappings.Add("Distance", "Distance"); sqlBulkCopy.ColumnMappings.Add("TaxiIn", "TaxiIn"); sqlBulkCopy.ColumnMappings.Add("TaxiOut", "TaxiOut"); sqlBulkCopy.ColumnMappings.Add("Cancelled", "Cancelled"); sqlBulkCopy.ColumnMappings.Add("CancellationCode", "CancellationCode"); sqlBulkCopy.ColumnMappings.Add("Diverted", "Diverted"); sqlBulkCopy.ColumnMappings.Add("CarrierDelay", "CarrierDelay"); sqlBulkCopy.ColumnMappings.Add("WeatherDelay", "WeatherDelay"); sqlBulkCopy.ColumnMappings.Add("NASDelay", "NASDelay"); sqlBulkCopy.ColumnMappings.Add("SecurityDelay", "SecurityDelay"); sqlBulkCopy.ColumnMappings.Add("LateAircraftDelay", "LateAircraftDelay"); // Loop through the CSV and load each set of 100,000 records into a DataTable while (!textFieldParser.EndOfData) { if (createdCount == 0) { textFieldParser.ReadFields(); } dataTable.Rows.Add(textFieldParser.ReadFields()); createdCount++; if (createdCount % _batchSize == 0) { LoadDataTable(sqlBulkCopy, sqlConnection, dataTable); } } // Send a final set to SQL Server LoadDataTable(sqlBulkCopy, sqlConnection, dataTable); sqlConnection.Close(); } } } catch (System.Exception ex) { System.Console.WriteLine("ERROR : "+ex.Message); } } static void Main(string[] args) { try { var filePath = ""; try { filePath = args[0]; } catch (System.Exception) { filePath = ""; } System.Console.WriteLine("{0}\t- Starting load of {1}", System.DateTime.Now.ToString(), filePath); LoadUTF8CSVIntoSQL.LoadCsvDataIntoSqlServer(filePath); System.Console.WriteLine("{0}\t- Completed load of {1}", System.DateTime.Now.ToString(), filePath); } catch (System.Exception ex) { System.Console.WriteLine("ERROR : "+ex.Message); } } }
Hi Nick
If you have access to a Linux/OSX shell you may with to use iconv to switch from UTF-8 to UTF-16 (which BULK INSERT will understand).
The WikiPedia entry documents the tool well enough:
http://en.wikipedia.org/wiki/Iconv
It really us unforgivable that SQL Server doesn’t have this support out of the box.