Recently, I’ve been learning about the Statistical language R. To allow me to do some testing against it, I wanted to get a decent sized dataset into SQL Server.

Having found a good dataset, the Airlines data from the 2009 Data Expo, I downloaded it, and proceeded to try and load it into SQL Server. I failed almost straightaway since the downloaded files were UTF-8 in format, and SQL Server cannot directly load them in using Bulk Copy or BCP.

So, I looked around for another option. I could have used SSIS, however, there seemed to be an issue with the Data Conversion module, where it wasn’t saving the changes having converted the fields from UTF-8 to ASCII.

My next choice was to have a look to see if C# would help. It did. The code below allowed me to take the source file (specified in _filePath) and load it into a database table (specified in _connString).

The code works by creating a DataTable into which the data from the CSV file is loaded. The data is then fed into SQL Server in batches of 100,0000 (parameterised using _batchSize).

Here is the code, it’s pretty well performing on my machine, running at 20-25k rows a second.

using Microsoft.VisualBasic.FileIO;
using System.Data;
using System.Data.SqlClient;

    public class LoadUTF8CSVIntoSQL
    {
        protected const string _filePath = @"C:\Inbox\DataSets\Flights\extract\1987.csv";
        protected const string _connString = @"Server=localhost;Database=Datasets;Trusted_Connection=True;";
        protected const int _batchSize = 100000;

        protected static void LoadDataTable(SqlBulkCopy sqlBulkCopy, SqlConnection sqlConnection, DataTable dataTable)
        {
            try
            {
                sqlBulkCopy.WriteToServer(dataTable);

                dataTable.Rows.Clear();
                System.Console.WriteLine("{0}\t- Batch Written",System.DateTime.Now.ToString());
            }
            catch (System.Exception ex)
            {
                System.Console.WriteLine("ERROR : "+ex.Message);
            }
        }

        public static void LoadCsvDataIntoSqlServer(string sFilePath)
        {
            try
            {
                // This should be the full path
                var fileName = "";
                if (sFilePath.Length == 0)
                {
                    fileName = _filePath;
                }
                else
                {
                    fileName = sFilePath;
                }

                var createdCount = 0;

                using (var textFieldParser = new TextFieldParser(fileName))
                {
                    textFieldParser.TextFieldType = FieldType.Delimited;
                    textFieldParser.Delimiters = new[] { "," };
                    textFieldParser.HasFieldsEnclosedInQuotes = false;

                    var connectionString = _connString;

                    var dataTable = new DataTable("airlines");

                    // Specify the columns in the Data Table
                    dataTable.Columns.Add("Yr");
                    dataTable.Columns.Add("Mth");
                    dataTable.Columns.Add("DayofMonth");
                    dataTable.Columns.Add("DayOfWeek");
                    dataTable.Columns.Add("DepTime");
                    dataTable.Columns.Add("CRSDepTime");
                    dataTable.Columns.Add("ArrTime");
                    dataTable.Columns.Add("CRSArrTime");
                    dataTable.Columns.Add("UniqueCarrier");
                    dataTable.Columns.Add("FlightNum");
                    dataTable.Columns.Add("TailNum");
                    dataTable.Columns.Add("ActualElapsedTime");
                    dataTable.Columns.Add("CRSElapsedTime");
                    dataTable.Columns.Add("AirTime");
                    dataTable.Columns.Add("ArrDelay");
                    dataTable.Columns.Add("DepDelay");
                    dataTable.Columns.Add("Origin");
                    dataTable.Columns.Add("Dest");
                    dataTable.Columns.Add("Distance");
                    dataTable.Columns.Add("TaxiIn");
                    dataTable.Columns.Add("TaxiOut");
                    dataTable.Columns.Add("Cancelled");
                    dataTable.Columns.Add("CancellationCode");
                    dataTable.Columns.Add("Diverted");
                    dataTable.Columns.Add("CarrierDelay");
                    dataTable.Columns.Add("WeatherDelay");
                    dataTable.Columns.Add("NASDelay");
                    dataTable.Columns.Add("SecurityDelay");
                    dataTable.Columns.Add("LateAircraftDelay");

                    using (var sqlConnection = new SqlConnection(connectionString))
                    {
                        sqlConnection.Open();

                        // Initialise Bulk Copy Object
                        var sqlBulkCopy = new SqlBulkCopy(sqlConnection)
                        {
                            DestinationTableName = "airlines"
                        };

                        // Define column mappings between Data Table and Target Table
                        sqlBulkCopy.ColumnMappings.Add("Yr", "Yr");
                        sqlBulkCopy.ColumnMappings.Add("Mth", "Mth");
                        sqlBulkCopy.ColumnMappings.Add("DayofMonth", "DayofMonth");
                        sqlBulkCopy.ColumnMappings.Add("DayOfWeek", "DayOfWeek");
                        sqlBulkCopy.ColumnMappings.Add("DepTime", "DepTime");
                        sqlBulkCopy.ColumnMappings.Add("CRSDepTime", "CRSDepTime");
                        sqlBulkCopy.ColumnMappings.Add("ArrTime", "ArrTime");
                        sqlBulkCopy.ColumnMappings.Add("CRSArrTime", "CRSArrTime");
                        sqlBulkCopy.ColumnMappings.Add("UniqueCarrier", "UniqueCarrier");
                        sqlBulkCopy.ColumnMappings.Add("FlightNum", "FlightNum");
                        sqlBulkCopy.ColumnMappings.Add("TailNum", "TailNum");
                        sqlBulkCopy.ColumnMappings.Add("ActualElapsedTime", "ActualElapsedTime");
                        sqlBulkCopy.ColumnMappings.Add("CRSElapsedTime", "CRSElapsedTime");
                        sqlBulkCopy.ColumnMappings.Add("AirTime", "AirTime");
                        sqlBulkCopy.ColumnMappings.Add("ArrDelay", "ArrDelay");
                        sqlBulkCopy.ColumnMappings.Add("DepDelay", "DepDelay");
                        sqlBulkCopy.ColumnMappings.Add("Origin", "Origin");
                        sqlBulkCopy.ColumnMappings.Add("Dest", "Dest");
                        sqlBulkCopy.ColumnMappings.Add("Distance", "Distance");
                        sqlBulkCopy.ColumnMappings.Add("TaxiIn", "TaxiIn");
                        sqlBulkCopy.ColumnMappings.Add("TaxiOut", "TaxiOut");
                        sqlBulkCopy.ColumnMappings.Add("Cancelled", "Cancelled");
                        sqlBulkCopy.ColumnMappings.Add("CancellationCode", "CancellationCode");
                        sqlBulkCopy.ColumnMappings.Add("Diverted", "Diverted");
                        sqlBulkCopy.ColumnMappings.Add("CarrierDelay", "CarrierDelay");
                        sqlBulkCopy.ColumnMappings.Add("WeatherDelay", "WeatherDelay");
                        sqlBulkCopy.ColumnMappings.Add("NASDelay", "NASDelay");
                        sqlBulkCopy.ColumnMappings.Add("SecurityDelay", "SecurityDelay");
                        sqlBulkCopy.ColumnMappings.Add("LateAircraftDelay", "LateAircraftDelay");

                        // Loop through the CSV and load each set of 100,000 records into a DataTable
                        while (!textFieldParser.EndOfData)
                        {
                            if (createdCount == 0)
                            {
                                textFieldParser.ReadFields();
                            }
                            dataTable.Rows.Add(textFieldParser.ReadFields());

                            createdCount++;

                            if (createdCount % _batchSize == 0)
                            {
                                LoadDataTable(sqlBulkCopy, sqlConnection, dataTable);
                            }
                        }

                        // Send a final set to SQL Server
                        LoadDataTable(sqlBulkCopy, sqlConnection, dataTable);

                        sqlConnection.Close();
                    }
                }
            }
            catch (System.Exception ex)
            {
                System.Console.WriteLine("ERROR : "+ex.Message);
            }
        }

        static void Main(string[] args)
        {
            try
            {
                var filePath = "";

                try
                {
                    filePath = args[0];
                }
                catch (System.Exception)
                {
                    filePath = "";
                }


                System.Console.WriteLine("{0}\t- Starting load of {1}", System.DateTime.Now.ToString(), filePath);
                LoadUTF8CSVIntoSQL.LoadCsvDataIntoSqlServer(filePath);
                System.Console.WriteLine("{0}\t- Completed load of {1}", System.DateTime.Now.ToString(), filePath);
            }
            catch (System.Exception ex)
            {
                System.Console.WriteLine("ERROR : "+ex.Message);
            }
        }
   
}
Advertisements