Tag Archive: SQL Server


Recently, I’ve been learning about the statistical language R. To let me do some testing against it, I wanted to get a decent-sized dataset into SQL Server.

Having found a good dataset, the Airlines data from the 2009 Data Expo, I downloaded it and proceeded to try to load it into SQL Server. I failed almost straightaway, since the downloaded files were in UTF-8 format, and SQL Server cannot load them directly using Bulk Copy or BCP.

So, I looked around for another option. I could have used SSIS; however, there seemed to be an issue with the Data Conversion component, where the changes weren’t being saved after converting the fields from UTF-8 to ASCII.

My next choice was to see whether C# would help. It did. The code below allowed me to take the source file (specified in _filePath) and load it into a database table in the database specified in _connString.

The code works by creating a DataTable into which the data from the CSV file is loaded. The data is then fed into SQL Server in batches of 100,000 (parameterised using _batchSize).

Here is the code; it performs pretty well on my machine, running at 20-25k rows a second.

using Microsoft.VisualBasic.FileIO;
using System.Data;
using System.Data.SqlClient;

    public class LoadUTF8CSVIntoSQL
    {
        protected const string _filePath = @"C:\Inbox\DataSets\Flights\extract\1987.csv";
        protected const string _connString = @"Server=localhost;Database=Datasets;Trusted_Connection=True;";
        protected const int _batchSize = 100000;

        protected static void LoadDataTable(SqlBulkCopy sqlBulkCopy, SqlConnection sqlConnection, DataTable dataTable)
        {
            try
            {
                sqlBulkCopy.WriteToServer(dataTable);

                dataTable.Rows.Clear();
                System.Console.WriteLine("{0}\t- Batch Written",System.DateTime.Now.ToString());
            }
            catch (System.Exception ex)
            {
                System.Console.WriteLine("ERROR : "+ex.Message);
            }
        }

        public static void LoadCsvDataIntoSqlServer(string sFilePath)
        {
            try
            {
                // This should be the full path
                var fileName = "";
                if (sFilePath.Length == 0)
                {
                    fileName = _filePath;
                }
                else
                {
                    fileName = sFilePath;
                }

                var createdCount = 0;

                using (var textFieldParser = new TextFieldParser(fileName))
                {
                    textFieldParser.TextFieldType = FieldType.Delimited;
                    textFieldParser.Delimiters = new[] { "," };
                    textFieldParser.HasFieldsEnclosedInQuotes = false;

                    var connectionString = _connString;

                    var dataTable = new DataTable("airlines");

                    // Specify the columns in the Data Table
                    dataTable.Columns.Add("Yr");
                    dataTable.Columns.Add("Mth");
                    dataTable.Columns.Add("DayofMonth");
                    dataTable.Columns.Add("DayOfWeek");
                    dataTable.Columns.Add("DepTime");
                    dataTable.Columns.Add("CRSDepTime");
                    dataTable.Columns.Add("ArrTime");
                    dataTable.Columns.Add("CRSArrTime");
                    dataTable.Columns.Add("UniqueCarrier");
                    dataTable.Columns.Add("FlightNum");
                    dataTable.Columns.Add("TailNum");
                    dataTable.Columns.Add("ActualElapsedTime");
                    dataTable.Columns.Add("CRSElapsedTime");
                    dataTable.Columns.Add("AirTime");
                    dataTable.Columns.Add("ArrDelay");
                    dataTable.Columns.Add("DepDelay");
                    dataTable.Columns.Add("Origin");
                    dataTable.Columns.Add("Dest");
                    dataTable.Columns.Add("Distance");
                    dataTable.Columns.Add("TaxiIn");
                    dataTable.Columns.Add("TaxiOut");
                    dataTable.Columns.Add("Cancelled");
                    dataTable.Columns.Add("CancellationCode");
                    dataTable.Columns.Add("Diverted");
                    dataTable.Columns.Add("CarrierDelay");
                    dataTable.Columns.Add("WeatherDelay");
                    dataTable.Columns.Add("NASDelay");
                    dataTable.Columns.Add("SecurityDelay");
                    dataTable.Columns.Add("LateAircraftDelay");

                    using (var sqlConnection = new SqlConnection(connectionString))
                    {
                        sqlConnection.Open();

                        // Initialise Bulk Copy Object
                        var sqlBulkCopy = new SqlBulkCopy(sqlConnection)
                        {
                            DestinationTableName = "airlines"
                        };

                        // Define column mappings between Data Table and Target Table
                        sqlBulkCopy.ColumnMappings.Add("Yr", "Yr");
                        sqlBulkCopy.ColumnMappings.Add("Mth", "Mth");
                        sqlBulkCopy.ColumnMappings.Add("DayofMonth", "DayofMonth");
                        sqlBulkCopy.ColumnMappings.Add("DayOfWeek", "DayOfWeek");
                        sqlBulkCopy.ColumnMappings.Add("DepTime", "DepTime");
                        sqlBulkCopy.ColumnMappings.Add("CRSDepTime", "CRSDepTime");
                        sqlBulkCopy.ColumnMappings.Add("ArrTime", "ArrTime");
                        sqlBulkCopy.ColumnMappings.Add("CRSArrTime", "CRSArrTime");
                        sqlBulkCopy.ColumnMappings.Add("UniqueCarrier", "UniqueCarrier");
                        sqlBulkCopy.ColumnMappings.Add("FlightNum", "FlightNum");
                        sqlBulkCopy.ColumnMappings.Add("TailNum", "TailNum");
                        sqlBulkCopy.ColumnMappings.Add("ActualElapsedTime", "ActualElapsedTime");
                        sqlBulkCopy.ColumnMappings.Add("CRSElapsedTime", "CRSElapsedTime");
                        sqlBulkCopy.ColumnMappings.Add("AirTime", "AirTime");
                        sqlBulkCopy.ColumnMappings.Add("ArrDelay", "ArrDelay");
                        sqlBulkCopy.ColumnMappings.Add("DepDelay", "DepDelay");
                        sqlBulkCopy.ColumnMappings.Add("Origin", "Origin");
                        sqlBulkCopy.ColumnMappings.Add("Dest", "Dest");
                        sqlBulkCopy.ColumnMappings.Add("Distance", "Distance");
                        sqlBulkCopy.ColumnMappings.Add("TaxiIn", "TaxiIn");
                        sqlBulkCopy.ColumnMappings.Add("TaxiOut", "TaxiOut");
                        sqlBulkCopy.ColumnMappings.Add("Cancelled", "Cancelled");
                        sqlBulkCopy.ColumnMappings.Add("CancellationCode", "CancellationCode");
                        sqlBulkCopy.ColumnMappings.Add("Diverted", "Diverted");
                        sqlBulkCopy.ColumnMappings.Add("CarrierDelay", "CarrierDelay");
                        sqlBulkCopy.ColumnMappings.Add("WeatherDelay", "WeatherDelay");
                        sqlBulkCopy.ColumnMappings.Add("NASDelay", "NASDelay");
                        sqlBulkCopy.ColumnMappings.Add("SecurityDelay", "SecurityDelay");
                        sqlBulkCopy.ColumnMappings.Add("LateAircraftDelay", "LateAircraftDelay");

                        // Loop through the CSV and load each set of 100,000 records into a DataTable
                        while (!textFieldParser.EndOfData)
                        {
                            if (createdCount == 0)
                            {
                                textFieldParser.ReadFields();
                            }
                            dataTable.Rows.Add(textFieldParser.ReadFields());

                            createdCount++;

                            if (createdCount % _batchSize == 0)
                            {
                                LoadDataTable(sqlBulkCopy, sqlConnection, dataTable);
                            }
                        }

                        // Send a final set to SQL Server
                        LoadDataTable(sqlBulkCopy, sqlConnection, dataTable);

                        sqlConnection.Close();
                    }
                }
            }
            catch (System.Exception ex)
            {
                System.Console.WriteLine("ERROR : "+ex.Message);
            }
        }

        static void Main(string[] args)
        {
            try
            {
                var filePath = "";

                try
                {
                    filePath = args[0];
                }
                catch (System.Exception)
                {
                    filePath = "";
                }


                System.Console.WriteLine("{0}\t- Starting load of {1}", System.DateTime.Now.ToString(), filePath);
                LoadUTF8CSVIntoSQL.LoadCsvDataIntoSqlServer(filePath);
                System.Console.WriteLine("{0}\t- Completed load of {1}", System.DateTime.Now.ToString(), filePath);
            }
            catch (System.Exception ex)
            {
                System.Console.WriteLine("ERROR : "+ex.Message);
            }
        }
   
}
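One thing to note: the code above assumes a destination table called airlines already exists in the Datasets database, with column names matching the mappings. Something like the DDL below would work as a first pass; the varchar types and lengths are my own assumption (the DataTable columns are untyped strings, and the Data Expo files use ‘NA’ for missing values), so you may well want to tighten the types once the data is in.

CREATE TABLE [dbo].[airlines](
    [Yr] [varchar](10) NULL,
    [Mth] [varchar](10) NULL,
    [DayofMonth] [varchar](10) NULL,
    [DayOfWeek] [varchar](10) NULL,
    [DepTime] [varchar](10) NULL,
    [CRSDepTime] [varchar](10) NULL,
    [ArrTime] [varchar](10) NULL,
    [CRSArrTime] [varchar](10) NULL,
    [UniqueCarrier] [varchar](10) NULL,
    [FlightNum] [varchar](10) NULL,
    [TailNum] [varchar](20) NULL,
    [ActualElapsedTime] [varchar](10) NULL,
    [CRSElapsedTime] [varchar](10) NULL,
    [AirTime] [varchar](10) NULL,
    [ArrDelay] [varchar](10) NULL,
    [DepDelay] [varchar](10) NULL,
    [Origin] [varchar](10) NULL,
    [Dest] [varchar](10) NULL,
    [Distance] [varchar](10) NULL,
    [TaxiIn] [varchar](10) NULL,
    [TaxiOut] [varchar](10) NULL,
    [Cancelled] [varchar](10) NULL,
    [CancellationCode] [varchar](10) NULL,
    [Diverted] [varchar](10) NULL,
    [CarrierDelay] [varchar](10) NULL,
    [WeatherDelay] [varchar](10) NULL,
    [NASDelay] [varchar](10) NULL,
    [SecurityDelay] [varchar](10) NULL,
    [LateAircraftDelay] [varchar](10) NULL
)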

Parallel SQL in C#

So, I’ve been wanting to get back to playing with C# for a while, and finally have had the opportunity.

I’ve also been wanting to play with the Task library in .NET and see if I could get it to do something interesting, well below is the result.

The code below, running in a .NET 4 project, will run two SQL SELECT statements against the AdventureWorks2012 database.

There are three tasks in here: ParallelTask1, ParallelTask2, and a timing task. Each parallel task takes a connection string and a query as inputs, and passes out a status message. One of the important points with a task is that it has to be self-contained; this is why the connection is instantiated within the task.

I also added in a Timing task (ParallelTiming) so I could pass out a ping message.

The whole thing is controlled by the code in the main section, which is used to start the three tasks, with their appropriate parameters.

After this it awaits the tasks completing, then passes out the resulting return messages.

Try it out; it’s good fun and all you need is SQL Server, AdventureWorks and something to build C# projects.

DISCLAIMER: I am by no means a C# expert… 🙂

You can download the code here

Have fun!

/// Parallel_SQL demonstration code
/// From Nick Haslam
/// http://blog.nhaslam.com
/// 16/9/2013

using System;
using System.Collections.Generic;
using System.Data.SqlClient;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Parallel_SQL
{
    class Program
    {
        /// <summary>
        /// First Parallel task
        /// </summary>
        /// <param name="sConnString">Connection string details</param>
        /// <param name="sQuery">Query to execute</param>
        /// <param name="StatusMessage">Status message to pass back</param>
        static Task<string> ParallelTask1(string sConnString, string sQuery, Action<string> StatusMessage)
        {
            return Task.Factory.StartNew(() =>
            {
                SqlConnection conn = new SqlConnection(sConnString);
                conn.Open();

                StatusMessage("Running Query");

                SqlDataReader reader = null;
                SqlCommand sqlCommand = new SqlCommand(sQuery, conn);

                reader = sqlCommand.ExecuteReader();

                while (reader.Read())
                {
                    StatusMessage(reader[0].ToString());
                }

                return "Task 1 Complete";
            });
        }

        /// <summary>
        /// Second Parallel task
        /// </summary>
        /// <param name="sConnString">Connection string details</param>
        /// <param name="sQuery">Query to execute</param>
        /// <param name="StatusMessage">Status message to pass back</param>
        static Task<string> ParallelTask2(string sConnString, string sQuery, Action<string> StatusMessage)
        {
            return Task.Factory.StartNew(() =>
            {
                SqlConnection conn = new SqlConnection(sConnString);
                conn.Open();

                StatusMessage("Running Query");

                SqlDataReader reader = null;
                SqlCommand sqlCommand = new SqlCommand(sQuery, conn);

                reader = sqlCommand.ExecuteReader();

                while (reader.Read())
                {
                    StatusMessage(reader[0].ToString());
                }

                return "Task 2 Complete";
            });
        }

        /// <summary>
        /// Timing Task
        /// </summary>
        /// <param name="iMSPause">Milliseconds between ping</param>
        /// <param name="StatusMessage">Status message to pass back</param>
        static Task<string> ParallelTiming(int iMSPause, Action<string> StatusMessage)
        {
            return Task.Factory.StartNew(() =>
            {
                for (int i = 0; i < 10; i++)
                {
                    System.Threading.Thread.Sleep(iMSPause);
                    StatusMessage("******************** PING ********************");
                }

                return "Timing task done";
            });
        }

        static void Main(string[] args)
        {
            string sConnString = "server=.; Trusted_Connection=yes; database=AdventureWorks2012;";

            try
            {
                var Task1Control = ParallelTask1(sConnString,
                    "SELECT top 500 TransactionID FROM Production.TransactionHistory", (update) =>
                    {
                        Console.WriteLine(String.Format("{0} - {1}", DateTime.Now, update));
                    });

                var Task2Control = ParallelTask2(sConnString,
                    "SELECT top 500 SalesOrderDetailID FROM sales.SalesOrderDetail", (update) =>
                    {
                        Console.WriteLine(String.Format("{0} - \t\t{1}", DateTime.Now, update));
                    });

                var TimingTaskControl = ParallelTiming(250, (update) =>
                {
                    Console.WriteLine(String.Format("{0} - \t\t\t{1}", DateTime.Now, update));
                });

                // Await Completion of the tasks
                Console.WriteLine("Task 1 Status - {0}", Task1Control.Result);
                Console.WriteLine("Task 2 Status - {0}", Task2Control.Result);
                Console.WriteLine("Timing Task Status - {0}", TimingTaskControl.Result);
            }
            catch (Exception e)
            {
                Console.WriteLine(e.ToString());
            }
            Console.ReadKey();
        }
    }
}

On Monday 1st July 2013, the UK Police launched a hub for accessing their data. Read the announcement here: http://www.data.gov.uk/blog/the-launch-of-datapoliceuk

You can find the portal here: http://data.police.uk/

The data extracts are here: http://data.police.uk/data/

I wanted to have a look at this and see what can be done with it, but there are dozens of csv files to load in to get the full view, so this is a quick post on what I’ve been doing this evening. 🙂

Having found an interesting post on loading multiple CSV files in T-SQL, I’ve updated it to load these files in. First though, we need to create a table to take the data.

Create the Crime_Info table

An initial table to receive the data is shown below:

CREATE TABLE [dbo].[Crime_Info]( 
    [Crime ID] [varchar](100) NULL, 
    [Month] [varchar](100) NULL, 
    [Reported by] [varchar](100) NULL, 
    [Falls within] [varchar](100) NULL, 
    [Longitude] [float] NULL, 
    [Latitude] [float] NULL, 
    [Location] [varchar](100) NULL, 
    [LSOA code] [varchar](100) NULL, 
    [LSOA name] [varchar](100) NULL, 
    [Crime type] [varchar](200) NULL, 
    [Last outcome category] [varchar](max) NULL, 
    [Context] [varchar](max) NULL 
) ON [PRIMARY] WITH (DATA_COMPRESSION = PAGE )

Note that I page-compressed the table, as I wanted to reduce the space it took up. It went down from 3GB to around 1.5GB.
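If you create the table uncompressed first and want to compress it afterwards, the same option can be applied with a rebuild, and sp_spaceused gives a quick before/after check of the space used:

-- Apply page compression to an existing table
ALTER TABLE [dbo].[Crime_Info] REBUILD WITH (DATA_COMPRESSION = PAGE);

-- Check the space used by the table
EXEC sp_spaceused 'dbo.Crime_Info';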

Load the Crime files

I’ve modified the bulk load script mentioned above, and you can download it here:

https://dl.dropboxusercontent.com/u/2765900/Load_UK_Crime_Data.sql
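In case that link disappears, the general shape of the script is sketched below. This is only an outline of the approach rather than the exact script: it assumes the monthly CSV files have been extracted into a single folder, that xp_cmdshell is enabled, and that each file has a header row to skip.

DECLARE @Folder varchar(200) = 'C:\BigData\Crime\extract\';
DECLARE @cmd varchar(300) = 'dir /b ' + @Folder + '*.csv';
DECLARE @FileName varchar(200), @sql nvarchar(max);

-- Get the list of CSV files in the folder
CREATE TABLE #Files (FileName varchar(200) NULL);
INSERT INTO #Files (FileName)
EXEC master..xp_cmdshell @cmd;

-- Loop through the files and BULK INSERT each one
DECLARE FileCursor CURSOR FOR
    SELECT FileName FROM #Files WHERE FileName LIKE '%.csv';

OPEN FileCursor;
FETCH NEXT FROM FileCursor INTO @FileName;
WHILE @@FETCH_STATUS = 0
BEGIN
    SET @sql = 'BULK INSERT dbo.Crime_Info FROM ''' + @Folder + @FileName + '''
                WITH (FIELDTERMINATOR = '','', ROWTERMINATOR = ''\n'', FIRSTROW = 2);';
    EXEC sp_executesql @sql;
    FETCH NEXT FROM FileCursor INTO @FileName;
END

CLOSE FileCursor;
DEALLOCATE FileCursor;
DROP TABLE #Files;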

Once loaded, I had 15,668,549 rows of data, sitting in a 1.2GB database.

Then, by playing with PowerPivot you can create ‘fun’ visualisations like this.

image

Have fun, and as they say on Crimewatch, “don’t have nightmares, do sleep well”.

Welcome to TSql2sday issue #35, this time hosted by me…

It’s a bit last minute, as I stepped in to help Adam out, so bear with me. As always, thanks to Adam for starting this off, I’ve posted a few articles on previous runs, and have found other people’s posts to be really interesting. I hope this follows in the same way.

Over the past couple of days I’ve been attending a training course in Paris, and one evening, to relax I watched ‘Soylent Green‘, a classic science fiction film. If you’ve not seen it, I recommend it, and go and watch it …

So, what I’d like to know is, what is your most horrifying discovery from your work with SQL Server?

We all like to read stories of other people’s misfortunes, and in some ways they help to make us better people, as we learn from them. Hopefully nothing is quite as bad as Charlton Heston’s discovery, but your story may be horrifying in its own way.

A couple of extra thoughts for motivational thinking…

Soylent Brown – You did a post, Great Job!!

Soylent Orange – You did a post, it made me wince!

Soylent Green  – You did a post, it made me wince, and it included some T-SQL.

Do you have the words straight?

Here are the rules, as usual. If you would like to participate in T-SQL Tuesday, please be sure to follow them:

  • Your blog post must be published between Tuesday, October 9th 2012 00:00:00 GMT and Wednesday, October 10th 2012 00:00:00 GMT.
  • Include the T-SQL Tuesday logo (above) and hyperlink it back to this post.
  • If you don’t see your post in trackbacks, add the link to the comments below.
  • If you are on Twitter please tweet your blog using the #TSQL2sDay hashtag. I can be contacted there as @nhaslam, in case you have questions or problems with comments/trackback.

Thank you all for participating, and special thanks to Adam Machanic (b|t) for all his help and for continuing this series!

Thanks for posting, and I’ll have a follow-up post listing all the contributions as soon as I can.

Over the past few evenings, I’ve been playing with SQLIO, to get an idea of how SSD compares to a couple of servers (one quite old, one a bit newer) that I have access to.

SQLIO can be used to do performance testing of an IO subsystem prior to deploying SQL Server onto it. Despite the name, it doesn’t actually do anything specifically with SQL; it’s just IO.
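For reference, SQLIO itself is driven from the command line. The line below is purely illustrative (it isn’t the exact set of parameters the pre-built package mentioned later uses), and it assumes a suitably sized test file already exists at the path given; it runs two minutes of 8KB random reads, with two threads and eight outstanding I/Os per thread, capturing latency:

sqlio -kR -t2 -s120 -frandom -o8 -b8 -LS C:\Temp\testfile.dat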

If you haven’t looked at SQLIO, I would highly recommend looking at these websites:

http://www.sqlskills.com/BLOGS/PAUL/post/Cool-free-tool-to-parse-and-analyze-SQLIO-results.aspx

http://tools.davidklee.net/sqlio/sqlio-analyzer.aspx

The SQLIO Analyser, created by David Klee, is amazing. It allows you to run the SQLIO package (a preconfigured one is available on the site) and submit the results. It then generates an Excel file that contains various metrics. It’s nice!

Running on my Laptop…

Having run the pre-built package on my laptop, I got the following metrics out of it. As you can see, it’s an SSD (a Crucial M4), and pretty nippy.

image

image

There are some interesting metrics here; one of the key benefits of an SSD is that, regardless of what you are doing, the average latency stays low. For these tests, I was getting:

Avg. Metrics     Sequential Read   Random Read   Sequential Write   Random Write
Latency (ms)     19.28             18.38         23.21              51.51
Avg IOPs         3777              3493          2930               1340
MB/s             236.07            218.3         183                83.7

Running on an older server

So, running this on an older server, connected to a much older (6-8+ year old) SAN, gave me these results. You can see that the metrics are all much lower, and there is a much wider spread for all of them, and that is down to the spinning disks.

image

image

As you can see from the metrics below, there is a significant drop in the performance of the server, and a lot more variance across the load types.

Avg. Metrics     Sequential Read   Random Read   Sequential Write   Random Write
Latency (ms)     24.81             66.79         373                260
Avg IOPs         1928              710           186                210
MB/s             120               44.3          11.6               13.14

Slightly newer Server

So, next I had the SQLIO package running on a slightly newer server (with a higher spec I/O system, I was told), which gave the following results.

image

image

As expected, this did give generally better results, though it is interesting that sequential read had better throughput on the older server.

Avg. Metrics     Sequential Read   Random Read   Sequential Write   Random Write
Latency (ms)     35.13             44.17         41.81              77.44
Avg IOPs         1474              1021          1314               794
MB/s             92.7              63.8          82.8               49.6

Cracking open VMware

Since I use VMware Workstation for compartmentalising projects on my laptop, I thought I’d run this against a VM. The VM was running on the SSD (from the top of the post), so I could see how much of an impact the VMware drivers had on the process. This gave some interesting results, which you can see below. Obviously there is something screwy going on here; it’s not likely that the VM can perform that much faster than the drive it’s sitting on. It would be nice if it could, though…

image

image

Avg. Metrics     Sequential Read   Random Read   Sequential Write   Random Write
Latency (ms)     7.8               7.5           7.63               7.71
Avg IOPs         12435             13119         15481              14965
MB/s             777               819           967                935

While the whole process was running, Task Manager on the host machine was sitting at around 0-2% for disk utilisation, but the CPU was sitting at 50-60%. So it was hardly touching the disk, which suggests the I/O was being satisfied from cache somewhere in the VMware layer rather than by the SSD itself.

image

Conclusion

Just to summarise, in case you didn’t already know: SSDs are really quick. For the testing I was doing, the SSD was giving me approximately double the performance of some pretty expensive hardware (or at least it was expensive 5-10 years ago…).

Also, take your test results with a grain of salt.

It’s another TSQL2sday post, this time hosted by Rob Volk (b | t ). Thanks for hosting Rob.

So this month, it’s about how we fixed a problem, or found help when we couldn’t fix a problem, with a theme based on ‘Help’ by The Beatles.

I chose the 2nd verse…

When I was younger, so much younger than today

So, many years ago, when I started out with SQL Server, back in the heady days of 6.5, there was much less of a SQL community; actually, I don’t even remember one. The only way I could get help was either through MSDN, or by emailing colleagues I’d met on a SQL training course.

I never needed anybody’s help in any way.

Though that’s primarily because I stopped using SQL for a while, just a year or so, but still.

Everyone needs help, at some point, with something. It’s not a weakness, it’s a strength.

But now these days are gone, I’m not so self assured.

In the past few years, I’ve started working more and more with SQL, and found that it is such a huge product that no one can know the whole thing (SSAS, SSIS, SSRS included), and because of that, I’ve found several ways to get help if I need it.

Though, before I get into that, I need to say something about the community. There is a huge SQL community out there, though the first community event I attended wasn’t a SQL one. It was a developer event, Remix UK, back in 2008 (http://www.microsoft.com/uk/remix08/default.aspx). It was a great event and I got to meet some great people there, including Scott Guthrie! Getting to this event was pretty much solely due to an ex-colleague, Jes Kirkup. Thanks Jes!

Since then I’ve started attending community events where I can, including the local DevEvening events (where I’ve done a couple of short presentations) and SQL community events (SQLMaidenhead, SQL in the Evening, and SQLBits of course!). I’ve found that these are a great way of getting an insight into what skills others in the industry have, and so where I should be targeting my learning. Following on from that, I’ve met some great people, and there are people I know I could ask for help if I needed to.

Not to mention the #SQLHelp hashtag on Twitter, where there is help pretty much 24 hours a day, the only restriction being the need to phrase your question within Twitter’s 140 characters (less the hashtag itself).

Now I find I’ve changed my mind and opened up the doors.

Now I find that I am helping people where I get the opportunity, am publishing blog articles (here, like this one!) and am hoping to do more Community presentations. Furthermore, I’m doing internal training courses (next month I’m doing one on SSAS), and have recently started mentoring a colleague in SQL.

It’s great to be able to share knowledge and experience.

Thanks for listening, and reading, and thanks again to Rob for hosting.

VS2012 Schema Comparison

Having recently been playing with the newly released Visual Studio 2012, one of the really nice features that I’ve seen is the Database Schema Comparison functionality.

If you’d like to follow along with this, you’ll need the ContosoBI database, which is available here.

This can be seen by launching VS2012, choosing New Project, and selecting the SQL Server Database Project. Don’t forget to give the Project a name, I called mine dbSchemaComparison.

image

When the Solution has been created, you’ll be presented with the Solution explorer.

image

In here, you’ll want to right-click on the Project name and choose Import > Database. You’ll then need to create a new connection to your database. Also, if you want to track everything, check the Permissions and Database Settings tick boxes. Then click Start.

image

While the process is running, you’ll be presented with a dialog box showing the progress. When it’s completed, click Finish.

image

Now, when you look in the Solution Explorer, you’ll see a set of SQL Scripts that have been created to match the structure in the database.

image

My next step was to connect to the database using SQL Server Management Studio, and alter one of the tables. I decided to add an Index to the DimAccount table. The index was called ix_date, and I added the LoadDate field from the DimAccount table to it.
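In T-SQL terms, the change was simply something along these lines (assuming the table is in the dbo schema):

CREATE NONCLUSTERED INDEX ix_date ON [dbo].[DimAccount] ([LoadDate]);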

image

The final step in this process is to go back into Visual Studio, right click on the Project and choose Schema Compare. When this window opens, you have two drop down boxes. The left contains the Project that you have in VS2012; the right will need to be populated with a database for comparison.

image

When you’ve populated the database on the right, click the Compare button. The Schemas from the two projects are loaded and compared. The results are then displayed on the screen. As can be seen below, it’s pretty obvious what the differences between the environments are.

image

If you then want to sync the environments, remember that changes are applied from Source (left) to Target (right). So if you want to remove the new index from the database on the right, you can click Update (or the script button next to it, to generate a script instead). Alternatively, if you want to update your project, you can click the ‘switch’ button between the two drop-downs and rerun the Compare.

A really nice feature, I think you’ll agree.

Sleeplessness in Seattle

Over the past week, I’ve been attending the IE2 Course, held by SQLSkills, in Bellevue (near Seattle). It’s been a really intense week, covering a lot of really deep technical stuff.

However, I’m not going to talk about that. The benefits of training by some of the leading SQL Server people in the world should be obvious. Also, my poor brain needs time to assimilate everything that’s been hosed into it.

It has, however, been a great honour to spend time with the great people on this course, and I mean the other attendees (such as Kendra Little, Jes Borland, Tim Ford and Dan Taylor among others) as well as the Instructors (Paul Randal, Kimberly Tripp, Jonathan Kehayias and Joe Sack).

A couple of the most impressive nuggets of knowledge I’ve gained over the past week:

sys.login_token – Lists the security tokens for your current login, including the Active Directory groups it belongs to (there’s an example query just after this list)

SQLIO Analyzer – David Klee has written a website that will analyze the output from SQLIO

Adventure Works Workload Generator – Jon Kehayias has a SQL workload generator.
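As an illustration of the first of those, the following query (my own example, not from the course notes) shows the Windows groups behind whichever login you’re currently connected as:

SELECT name, type, usage
FROM sys.login_token
WHERE type = 'WINDOWS GROUP';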

There are a great many other bits of knowledge I’ve gained, but these, so far, are the most immediate, quick wins, if you see what I mean…

It was a hard flight over here, 9.5 hours, on a plane that was an hour late departing, but I had the SQL Internals book to keep me occupied (between films, Marvel The Avengers, and The Hunger Games…).

I’d like to thank my employers, TAH Ltd (twitter|web), for sending me on the course; I hope that the benefits of this training will continue to be obvious for many moons.

More importantly, I’d like to thank my wife, Emma, since without her support, I’d never have had the confidence to travel 4500 miles for a training course.

Thank you, to everyone on the course, for making it a great learning experience.

PS: Sleeplessness, because for the majority of the week here I woke up at 3am almost every day, for no apparent reason.

Following on from my previous post on bringing data from Twitter into SQL Server, you’ll recall that we were able to take data from a JSON data feed, using cURL, and parse it into SQL Server, using a custom Function.

This enabled the twitter feed to be loaded into a SQL database at the staggering performance of 38 records a second. In a future post, I hope to do some optimisation on the T-SQL process for getting this data in.

However, I wanted to see how quickly the data could be loaded using a C# application (since I’m a bit of a closet coder).

So, following on from Steps 1 and 2 of the previous post (where you took the feed from Twitter and loaded the data into a Load table in SQL), I then want to get the data into a proper table in SQL Server.

There are several JSON libraries that are available, that link to C# (a list is here: http://json.org/). I chose JSON.NET (http://json.codeplex.com/) on the basis that it was quite frequently mentioned on StackOverflow (so if I had any questions, I’d stand a reasonable chance of getting an answer), and the examples looked pretty straightforward.

Having fired up VS 2012, I created a project, and added JSON.Net into it using NuGet (I like NuGet!), using Install-Package Newtonsoft.Json.

using System;
using System.Collections.Generic;
using System.Linq;
using Newtonsoft.Json;
using System.Data.SqlClient;

namespace JSONParse
{
    class Program
    {
        static void Main(string[] args)
        {
            DateTime dStart, dEnd;
            dStart = DateTime.Now;

            // Connect to SQL
            SqlConnection conn = new SqlConnection("server=localhost; Trusted_Connection=yes; database=twitter_stream;");
            try
            {
                conn.Open();
            }
            catch (Exception e)
            {
                Console.WriteLine(e.ToString());
            }

            try
            {
                // Set up the Reader process
                SqlDataReader reader = null;
                SqlCommand myCommand = new SqlCommand(
                    "select * from tweetJSON", conn);
                reader = myCommand.ExecuteReader();

                while (reader.Read())
                {
                    try
                    {
                        // Convert the JSON Data
                        dynamic obj = JsonConvert.DeserializeObject(reader[1].ToString());
                        string tJSID = reader[0].ToString();
                        Console.WriteLine("id_str {0}", obj.id_str);
                        SqlCommand insertcmd = new SqlCommand()
                        {
                            CommandText = @"insert into TweetJSONStaging ( tJSID, Country, id_str,
                                followers_count,profile_image_url,statuses_count,
                                profile_background_image_url,created_at,friends_count,
                                location,name,lang, screen_name, source, geo_loc, text)
                                VALUES (@tJSID, @country,@id_str,@followers, @profileURL, @statuses,
                                @backgroundimageurl, @created_at, @friendscount,@location, @name,
                                @lang, @screenname,@source, @geoloc, @text)"
                        };
                        insertcmd.Connection = new SqlConnection("server=localhost; Trusted_Connection=yes; database=twitter_stream;");
                        insertcmd.Parameters.AddWithValue("@tJSID", tJSID);
                        insertcmd.Parameters.AddWithValue("@id_str", (string)obj.id_str);
                        insertcmd.Parameters.AddWithValue("@followers", (string)obj.user.followers_count);
                        insertcmd.Parameters.AddWithValue("@backgroundimageurl", (string)obj.user.profile_background_image_url);
                        insertcmd.Parameters.AddWithValue("@name", (string)obj.user.name);
                        insertcmd.Parameters.AddWithValue("@profileURL", (string)obj.user.profile_image_url);
                        insertcmd.Parameters.AddWithValue("@statuses", (string)obj.user.statuses_count);
                        insertcmd.Parameters.AddWithValue("@friendscount", (string)obj.user.friends_count);
                        insertcmd.Parameters.AddWithValue("@screenname", (string)obj.user.screen_name);
                        insertcmd.Parameters.AddWithValue("@lang", (string)obj.user.lang);
                        insertcmd.Parameters.AddWithValue("@text", (string)obj.text);
                        insertcmd.Parameters.AddWithValue("@source", (string)obj.source);
                        insertcmd.Parameters.AddWithValue("@created_at", (string)obj.created_at);
                        string sCountry = "", sLocation = "", sGeoloc = "";

                        try
                        {
                            sCountry = (string)obj.place.country;
                        }
                        catch (Exception e)
                        {
                            //Console.WriteLine(e.ToString());
                        }
                        insertcmd.Parameters.AddWithValue("@country", sCountry);

                        try
                        {
                            sLocation = (string)obj.user.location;
                        }
                        catch (Exception e)
                        {
                            //Console.WriteLine(e.ToString());
                        }
                        insertcmd.Parameters.AddWithValue("@location", sLocation);

                        try
                        {
                            sGeoloc = String.Join(",", obj.place.bounding_box.coordinates.Last.First);
                        }
                        catch (Exception e)
                        {
                            // Console.WriteLine(e.ToString());
                        }
                        insertcmd.Parameters.AddWithValue("@geoloc", sGeoloc);

                        insertcmd.Connection.Open();
                        try
                        {
                            insertcmd.ExecuteNonQuery();
                        }
                        catch (Exception)
                        { }
                        insertcmd.Connection.Close();
                    }
                    catch (Exception)
                    { }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e.ToString());
            }

            dEnd = DateTime.Now;
            Console.WriteLine("Time taken = " + dEnd.Subtract(dStart));

            Console.ReadKey();
        }
    }
}

Obviously, this code is just an example, and you’d want proper error handling, a better functional layout, better commenting and so on, but you can see how the code runs, and from a performance perspective it’s better.

Running this code in Debug mode in Visual Studio, I managed to get 155 records processed a second; pretty snappy.

Then, running it as an Application, outside Visual Studio I got 393 records a second! Just over 10 times faster than T-SQL with a Cursor.

image

I’ve been looking at how it might be possible to bring data from Twitter into SQL Server.

You might ask, Why ????

Well, why not ? It’s more an exercise in how this could be done using tools that are available.

There are several steps that I went through, and I’m pretty sure there may be a better way; if you can think of any improvements, feel free to use the comments section below.

Step 1 – Getting the Tweets

First up, we need to get the Twitter data. There are numerous ways to do this, however, the easiest way I’ve found is to use a product called cURL (available here: http://curl.haxx.se/download.html).

I saw this referenced while investigating the Microsoft Hadoop on Azure site (https://www.windowsazure.com/en-us/develop/net/tutorials/hadoop-social-web-data/) which was used to extract data to feed into a Hive database.

There are three parts to obtaining the Twitter data using cURL.

Part 1 – Get cURL, you can download this using the link above. I used the Win64 Binary SSL version.

Part 2 – Create a parameters file. As the MS link above shows, the parameters file acts as a filter to get the data you want from the Twitter feed. While it is possible to filter the data by hashtags, I wanted to get a more generalised set of data. To do this, I put the following filter in the parameters file. This effectively filters the data to any tweets that are geotagged (the four values are a bounding box covering the whole globe: the south-west corner’s longitude and latitude, then the north-east corner’s).

locations=-180,-90,180,90

Part 3 – Create a batch file to run the job. The batch file created is effectively the same as the one referenced in the MS link. The file is called GetTwitterStream.cmd, and contains the following text. You need to replace <twitterusername> and <twitterpassword> with your twitter credentials.

curl -d @twitter_params.txt -k https://stream.twitter.com/1/statuses/filter.json -u<twitterusername>:<twitterpassword> >>twitter_stream_seq.txt

When you run the GetTwitterStream.cmd file, it starts cURL and starts getting data from the public Twitter streaming API, as shown below.

image

This gives us a file containing the JSON feed from Twitter.

Step 2 – Load the Twitter JSON Data into SQL

Next we need to get the JSON data from Twitter into SQL. I created a Load table for this, with the following structure:

CREATE TABLE [dbo].[TweetJSON](
[JSONData] [varchar](8000) NULL,
[ID] [int] IDENTITY(1,1) NOT NULL,
[Processed] [char](1) NULL
) ON [PRIMARY]

Then, we can load the JSON file created from cURL in step 1, using BULK INSERT. We need a Format file for this, shown below, and called BIFormatFile.txt

9.0
1
1 SQLCHAR 0 8000 "\r\n" 1 [JSONData] ""

The data can then be loaded using this Bulk Insert task:

BULK INSERT [dbo].[TweetJSON]
from 'c:\BigData\TwitterData\twitter_stream_seq.txt'
with (CODEPAGE='RAW', FORMATFILE='C:\BigData\twitterdata\BIFormatFile.txt')

So now, we have a table with the JSON data in, and an Identity column to give us an ID we can reference.

Step 3 – Parse the JSON

Phil Factor has written a great article (here http://www.simple-talk.com/sql/t-sql-programming/consuming-json-strings-in-sql-server/), which covers parsing JSON in T-SQL. I used the parseJSON function from this article, to extract the required fields from the Load table.

I created a staging table:

CREATE TABLE [dbo].[TweetJSONStaging](
[Country] [varchar](200) NULL,
[id_str] [varchar](200) NULL,
[followers_count] [int] NULL,
[profile_image_url] [varchar](200) NULL,
[statuses_count] [int] NULL,
[profile_background_image_url] [varchar](200) NULL,
[created_at] [datetime] NULL,
[friends_count] [int] NULL,
[location] [varchar](200) NULL,
[name] [varchar](200) NULL,
[lang] [varchar](200) NULL,
[screen_name] [varchar](200) NULL,
[source] [varchar](200) NULL,
[geo_lat] [varchar](200) NULL,
[geo_long] [varchar](200) NULL
) ON [Staging]

Then I used the following process to iterate through the data and get it into the right format. The process followed here is to create a Cursor (I’ll get to this in a minute) over the records to change, call the parseJSON function against each one to split the fields out, then take the fields we want and insert them into the staging table. Next we set the Processed flag, and repeat until there are no more records to process.

declare @JSON NVARCHAR(MAX), @ID int

declare jsCursor CURSOR FOR
select JSONData, ID from tweetJson where Processed is null

open jsCursor

FETCH NEXT from jsCursor into @JSON, @ID
while @@FETCH_STATUS=0
BEGIN
begin try
insert into TweetJSONStaging ( Country, id_str, followers_count,
profile_image_url,statuses_count,profile_background_image_url,created_at,
friends_count,location,name,lang, screen_name, source, geo_lat, geo_long)
select
max(case when NAME='country' then StringValue end) as Country,
max(case when NAME='id_str' then StringValue end) as id_str,
max(case when NAME='followers_count' then convert (int,StringValue) end)
as followers_count,
max(case when NAME='profile_image_url' then StringValue end)
as profile_image_url,
max(case when NAME='statuses_count' then convert(int,StringValue) end)
as statuses_count,
max(case when NAME='profile_background_image_url' then StringValue end)
as profile_background_image_url,
max(case when NAME='created_at' then convert(datetime,
(substring (StringValue,9,2)+' '+substring (StringValue,5,3)+' '+
substring (StringValue,27,4) +' '+substring (StringValue,12,2) +':'+
substring (StringValue,15,2)+':'+substring (StringValue,18,2) ) ) end)
as created_at,
max(case when NAME='friends_count' then convert(int,StringValue) end)
as friends_count,
max(case when NAME='location' then StringValue end) as location,
max(case when NAME='name' then StringValue end) as name,
max(case when NAME='lang' then StringValue end) as lang,
max(case when NAME='screen_name' then StringValue end) as screen_name,
max(case when NAME='source' then StringValue end) as source,
max(case when element_id='1' then StringValue end) as geo_lat,
max(case when element_id='2' then StringValue end) as geo_long
from dbo.parseJSON( @JSON)

update tweetJSON
set Processed = 'Y'
where ID=@ID

end try
begin catch
update tweetJSON
set Processed = 'X'
where ID=@ID
end catch
FETCH NEXT from jsCursor into @JSON, @ID

end
close jsCursor
deallocate jsCursor

To allow this process to run in a reasonable amount of time, I created a couple of indexes on the load table (tweetJSON). The indexes are on the ID field (Clustered Index) and on the Processed flag.

CREATE UNIQUE CLUSTERED INDEX CI_ID ON [dbo].[TweetJSON]
( [ID] ASC ) ON [PRIMARY]

CREATE NONCLUSTERED INDEX NCI_Processed ON [dbo].[TweetJSON]
( [Processed] ASC ) ON [PRIMARY]

Running this process took approx. 26 seconds to load 1000 records, so approx. 38 records a second.

So, I thought I’d try it with a While clause, rather than a cursor, and interestingly, it took the same amount of time to run, for 1000 records.

Update: As raised by Dave Ballantyne (@davebally), this shows that a While clause is effectively doing the same as the Cursor, since the process is still running over records one by one. (Further information can be found here).

declare @JSON varchar(8000), @ID int, @count int

while 1=1
BEGIN
select top 1 @JSON = JSONData, @ID=ID from tweetJson where Processed ='N'
begin try
insert into TweetJSONStaging ( Country, id_str, followers_count,
profile_image_url,statuses_count,profile_background_image_url,created_at,
friends_count,location,name,lang, screen_name, source, geo_lat, geo_long)
select
max(case when NAME='country' then StringValue end) as Country,
max(case when NAME='id_str' then StringValue end) as id_str,
max(case when NAME='followers_count' then convert (int,StringValue) end)
as followers_count,
max(case when NAME='profile_image_url' then StringValue end)
as profile_image_url,
max(case when NAME='statuses_count' then convert(int,StringValue) end)
as statuses_count,
max(case when NAME='profile_background_image_url' then StringValue end)
as profile_background_image_url,
max(case when NAME='created_at' then convert(datetime,
(substring (StringValue,9,2)+' '+substring (StringValue,5,3)+' '+
substring (StringValue,27,4) +' '+substring (StringValue,12,2) +':'+
substring (StringValue,15,2)+':'+substring (StringValue,18,2) ) ) end)
as created_at,
max(case when NAME='friends_count' then convert(int,StringValue) end)
as friends_count,
max(case when NAME='location' then StringValue end) as location,
max(case when NAME='name' then StringValue end) as name,
max(case when NAME='lang' then StringValue end) as lang,
max(case when NAME='screen_name' then StringValue end) as screen_name,
max(case when NAME='source' then StringValue end) as source,
max(case when element_id='1' then StringValue end) as geo_lat,
max(case when element_id='2' then StringValue end) as geo_long
from dbo.parseJSON( @JSON)

update tweetJSON
set Processed = 'Y'
where ID=@ID

end try
begin catch
update tweetJSON
set Processed = 'X'
where ID=@ID
end catch

select @count=count(1) from tweetJson where Processed ='N'

if @count=0
break
else
continue
end

Thanks for reading! I’ll add an update when I’ve made changes to make it more performant.