a:4:{s:5:"child";a:1:{s:0:"";a:1:{s:3:"rss";a:1:{i:0;a:6:{s:4:"data";s:3:"
";s:7:"attribs";a:1:{s:0:"";a:1:{s:7:"version";s:3:"2.0";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:1:{s:0:"";a:1:{s:7:"channel";a:1:{i:0;a:6:{s:4:"data";s:217:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:1:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:12:"Planet MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:27:"http://www.planetmysql.org/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 02 Jun 2010 07:00:01 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"language";a:1:{i:0;a:5:{s:4:"data";s:2:"en";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:42:"Planet MySQL - http://www.planetmysql.org/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"item";a:50:{i:0;a:6:{s:4:"data";s:78:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:42:"Disrupting IT with Open Source & Cloud";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:60:"tag:typepad.com,2003:post-6a00d83452e46469e2013482c906b0970c";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:54:"http://www.theopenforce.com/2010/06/disrupting-it.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:978:"A couple of weeks ago I gave a presentation at the Apache Lucene Eurocon in Prague. It was a good conference focused on Lucene/Solr open source search technology and sponsored by Lucid Imagination.
I've posted the bulk of the presentation below. (I omitted a couple of slides that were MySQL specific.) Even though it was a technical conference, I got positive feedback from the attendees and organizers that the information was useful in helping folks think about where to focus their efforts.
The slides have been posted to Box.net and are shown using their new "embedded preview" feature which is pretty cool. You can also use the short URL www.tinyurl.com/box-disr to view or download the slides in Powerpoint format.
Thanks to the folks at Lucid Imagination as well as those who gave input and feedback on the presentation.
Conference: Apache Lucene Eurocon, Agenda, Training
Lucid Imagination: Main site, Blog, Training, Services
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 02 Jun 2010 04:46:05 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:8:{i:0;a:5:{s:4:"data";s:8:"Business";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:6:"Apache";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:10:"conference";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:10:"Disruption";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:7:"eurocon";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:6:"Lucene";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:6:"prague";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:4:"Solr";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2153:"
A couple of weeks ago I gave a presentation at the Apache Lucene Eurocon in Prague. It was a good conference focused on Lucene/Solr open source search technology and sponsored by Lucid Imagination.
I've posted the bulk of the presentation below. (I omitted a couple of slides that were MySQL specific.) Even though it was a technical conference, I got positive feedback from the attendees and organizers that the information was useful in helping folks think about where to focus their efforts.
The slides have been posted to Box.net and are shown using their new "embedded preview" feature which is pretty cool. You can also use the short URL www.tinyurl.com/box-disr to view or download the slides in Powerpoint format.
Thanks to the folks at Lucid Imagination as well as those who gave input and feedback on the presentation.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"Zack Urlocker";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:1;a:6:{s:4:"data";s:68:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:42:"Under-provisioning: the curse of the cloud";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://www.xaprb.com/blog/?p=1884";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:79:"http://www.xaprb.com/blog/2010/06/01/under-provisioning-the-curse-of-the-cloud/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:5016:"A common problem I see people running into when using a cloud computing service is the trap of under-provisioning. There’s a chain effect that leads to this result: 1) people don’t understand how virtualization works, and therefore 2) they don’t realize how much of a computing resource they’re really buying, so 3) they assume they are entitled to more than they really are, and 4) they under-provision. A few other causes and effects come into play here, too. For example, the choice to use the cloud is sometimes founded on economic assumptions that frequently turn out to be wrong. The cloud service looks more economically attractive than it really is, due to under-provisioning.
Let’s get back to this idea that people under-provision. How do I know that’s happening? I’ll use anecdotal evidence to illustrate. Here’s a real quote from a recent engagement about database (MySQL) performance problems:
Do you think it’s likely that the underlying hardware is simply worse than average? If you think this will be an ongoing problem, maybe we should try our luck with a new instance/storage cluster?
The fundamental assumption here is that some clusters are overloaded and are giving poor quality of service. We’re trained to think this way because we are familiar with services such as shared hosting, where other users on your particular server might really be abusive and claim resources that should be yours. But this isn’t how virtualization works in the common cloud platforms. In these platforms, you aren’t sharing resources with other users. You are guaranteed to get what you deserve! No kidding — this actually works.
If that’s true, then why does performance fluctuate so much? The answer lies in how resources are parceled out. Assume there are 10 units of computing resources, and you’re paying for one of them. You buy 1/10th of the machine’s power. But it just happens that you’re the only virtual instance running on that physical server. You fire up an intense job. How much power do you get? You paid for 1 unit, but you get 10, because no one else is using the other 9 units. This is the way most virtualization platforms work: they give you extra resources if they’re available and not being claimed by anyone else’s instance. This guarantees that you’ll never get less than you deserve, but it leaves open the possibility that you’ll get more than you deserve. (What would be the point of wasting that power, really?) Under-provisioning is the obverse of over-providing, which is what the virtualization platform does.
First-generation hyperthreading gave the same illusion of more resources than are really available, by the way. It made you think there were multiple processors, when in fact there weren’t — there were multiple sets of registers. Hyperthreading is a form of virtualization, too.
What typically happens is that people are running their cloud instances on machines whose underlying physical hardware is not fully utilized, and they get used to a certain level of performance they’re not really paying for. Alas, you can’t really know whether this is happening or not! But it surely is in many (most?) cases, which is why occasionally you get some resource that seems much slower than you’re accustomed to, and you think it’s “too slow.” Not so. Your other units are “too fast.”
I have a theory that if you really knew the true capacity you were buying, you’d view the price-to-performance ratio much less favorably. But it’s almost impossible to know that, really; it doesn’t help that the cloud service providers are rather vague about how much power a certain instance size really gives you. (They aren’t being malicious; it’s just the way virtualization works.) Under-provisioning is almost forced on users because they have no alternative — you could plan for worst-case performance, and you’d be doing the right thing, but how will you ever know you’ve really hit rock bottom and the worst case is really no worse? How can you even benchmark and do proper capacity planning, if you don’t know what you’re benchmarking? This should really give you serious pause. You should be thinking “wait, I’m basing my capacity planning and provisioning on luck and the law of large numbers. What if my luck runs out and I get a Black Swan event?” The question is not “what if,” but “when.”
I also think that the lack of transparency encourages people to use cloud computing services for the wrong reasons altogether. I could write about this, but I think Theo Schlossnagle said it pretty well already.
Related posts:Drizzle stops the rainA review of The Art of Capacity Planning by John AllspawWhy MySQL might not benefit from having a mother ship";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 02 Jun 2010 01:31:46 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:6:{i:0;a:5:{s:4:"data";s:10:"Commentary";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:10:"PostgreSQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:3:"SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:15:"Cloud Computing";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:17:"Theo Schlossnagle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:14:"virtualization";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:6004:"A common problem I see people running into when using a cloud computing service is the trap of under-provisioning. There’s a chain effect that leads to this result: 1) people don’t understand how virtualization works, and therefore 2) they don’t realize how much of a computing resource they’re really buying, so 3) they assume they are entitled to more than they really are, and 4) they under-provision. A few other causes and effects come into play here, too. For example, the choice to use the cloud is sometimes founded on economic assumptions that frequently turn out to be wrong. The cloud service looks more economically attractive than it really is, due to under-provisioning.
Let’s get back to this idea that people under-provision. How do I know that’s happening? I’ll use anecdotal evidence to illustrate. Here’s a real quote from a recent engagement about database (MySQL) performance problems:
Do you think it’s likely that the underlying hardware is simply worse than average? If you think this will be an ongoing problem, maybe we should try our luck with a new instance/storage cluster?
The fundamental assumption here is that some clusters are overloaded and are giving poor quality of service. We’re trained to think this way because we are familiar with services such as shared hosting, where other users on your particular server might really be abusive and claim resources that should be yours. But this isn’t how virtualization works in the common cloud platforms. In these platforms, you aren’t sharing resources with other users. You are guaranteed to get what you deserve! No kidding — this actually works.
If that’s true, then why does performance fluctuate so much? The answer lies in how resources are parceled out. Assume there are 10 units of computing resources, and you’re paying for one of them. You buy 1/10th of the machine’s power. But it just happens that you’re the only virtual instance running on that physical server. You fire up an intense job. How much power do you get? You paid for 1 unit, but you get 10, because no one else is using the other 9 units. This is the way most virtualization platforms work: they give you extra resources if they’re available and not being claimed by anyone else’s instance. This guarantees that you’ll never get less than you deserve, but it leaves open the possibility that you’ll get more than you deserve. (What would be the point of wasting that power, really?) Under-provisioning is the obverse of over-providing, which is what the virtualization platform does.
First-generation hyperthreading gave the same illusion of more resources than are really available, by the way. It made you think there were multiple processors, when in fact there weren’t — there were multiple sets of registers. Hyperthreading is a form of virtualization, too.
What typically happens is that people are running their cloud instances on machines whose underlying physical hardware is not fully utilized, and they get used to a certain level of performance they’re not really paying for. Alas, you can’t really know whether this is happening or not! But it surely is in many (most?) cases, which is why occasionally you get some resource that seems much slower than you’re accustomed to, and you think it’s “too slow.” Not so. Your other units are “too fast.”
I have a theory that if you really knew the true capacity you were buying, you’d view the price-to-performance ratio much less favorably. But it’s almost impossible to know that, really; it doesn’t help that the cloud service providers are rather vague about how much power a certain instance size really gives you. (They aren’t being malicious; it’s just the way virtualization works.) Under-provisioning is almost forced on users because they have no alternative — you could plan for worst-case performance, and you’d be doing the right thing, but how will you ever know you’ve really hit rock bottom and the worst case is really no worse? How can you even benchmark and do proper capacity planning, if you don’t know what you’re benchmarking? This should really give you serious pause. You should be thinking “wait, I’m basing my capacity planning and provisioning on luck and the law of large numbers. What if my luck runs out and I get a Black Swan event?” The question is not “what if,” but “when.”
I also think that the lack of transparency encourages people to use cloud computing services for the wrong reasons altogether. I could write about this, but I think Theo Schlossnagle said it pretty well already.
Related posts:
- Drizzle stops the rain
- A review of The Art of Capacity Planning by John Allspaw
- Why MySQL might not benefit from having a mother ship
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:22:"Baron Schwartz (xaprb)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:2;a:6:{s:4:"data";s:48:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:47:"Check how old your MySQL books are before usage";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-7603704315097619422.post-3774987396997346416";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:76:"http://geert.vanderkelen.org/2010/06/check-how-old-your-mysql-books-are.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:725:"This is a friendly reminder to check the publication date and discussed version you MySQL books before starting out hacking or even posting about limitations. Lots of old books are still going around. Maybe it's good to destroy them rather than giving them to students or newbies.Few days ago (28 May 2010), for example, we had a word-for-word copy of a book on a blog post (now removed) which was discussing MySQL Cluster limitations from years ago. Well, it was funny at first and we had a good laugh. But it's a bit worrisome. My colleague Matthew posted a rebuttal post.How would you recycle the old, technical books? It's not worth giving them to public libraries, it's maybe unhealthy to burn them? How would you do it?";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 01 Jun 2010 18:24:24 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:2:{i:0;a:5:{s:4:"data";s:7:"opinion";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1300:"This is a friendly reminder to check the publication date and discussed version you MySQL books before starting out hacking or even posting about limitations. Lots of old books are still going around. Maybe it's good to destroy them rather than giving them to students or newbies.
Few days ago (28 May 2010), for example, we had a word-for-word copy of a book on a blog post (now removed) which was discussing MySQL Cluster limitations from years ago. Well, it was funny at first and we had a good laugh. But it's a bit worrisome. My colleague Matthew posted a rebuttal post.
How would you recycle the old, technical books? It's not worth giving them to public libraries, it's maybe unhealthy to burn them? How would you do it?
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:17:"Geert Vanderkelen";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:3;a:6:{s:4:"data";s:63:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:34:"Eventually consistent Group Commit";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:38:"http://ronaldbradford.com/blog/?p=2836";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:77:"http://ronaldbradford.com/blog/eventually-consistent-group-commit-2010-06-01/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1062:"Having just written an interview response about NoSQL concepts for a RDBMS audience it was poetic that an inconspicuous title “(4 of 3)” highlights that both a MySQL read scalable implementation via replication and a NoSQL solution can share a common lack of timely consistency of data. For the sake of Group Commit I hope my data is always consistent at some location at some point in time as soon as possible.
In attempting to comment to Kristian Nielsen’s Fixing MySQL group commit (part 4 of 3) I was forced to watch an ad before I could even add a comment. Go jump Live Journal, it’s quicker to write my own blog post.
And if anybody is still reading, I had just written the following.
“There is clearly a place for NoSQL solutions. The two primary types of products are a key/value store and a schema-less solution. You need to learn the strengths, benefits and weaknesses of both. For a RDBMS resource the lack of transactions, the lack of joins and the concept of eventually consistent can take some time to accept.”";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 01 Jun 2010 17:36:53 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:5:{i:0;a:5:{s:4:"data";s:9:"Databases";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:12:"Professional";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:11:"consistency";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:5:"nosql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1374:"Having just written an interview response about NoSQL concepts for a RDBMS audience it was poetic that an inconspicuous title “(4 of 3)” highlights that both a MySQL read scalable implementation via replication and a NoSQL solution can share a common lack of timely consistency of data. For the sake of Group Commit I hope my data is always consistent at some location at some point in time as soon as possible.
In attempting to comment to Kristian Nielsen’s Fixing MySQL group commit (part 4 of 3) I was forced to watch an ad before I could even add a comment. Go jump Live Journal, it’s quicker to write my own blog post.
And if anybody is still reading, I had just written the following.
“There is clearly a place for NoSQL solutions. The two primary types of products are a key/value store and a schema-less solution. You need to learn the strengths, benefits and weaknesses of both. For a RDBMS resource the lack of transactions, the lack of joins and the concept of eventually consistent can take some time to accept.”
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:15:"Ronald Bradford";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:4;a:6:{s:4:"data";s:63:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:48:"MySQL Connector/Net 6.3.2 beta has been released";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:35:"http://www.reggieburnett.com/?p=331";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:116:"http://feedproxy.google.com/~r/ReggiesRamblings-Mysql/~3/nU156uxFnNQ/mysql-connectornet-6-3-2-beta-has-been-released";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1064:"Sorry for this not getting out sooner. Connector/Net 6.3.2 was released several days ago but somehow the release announcement was never made.
MySQL Connector/Net 6.3.2, a new version of the all-managed .NET driver for MySQL has been released. This is a beta release and is intended to introduce you to the new features and enhancements we are planning. This
release should not be used in a production environment. It is now available in source and binary form from
[http://dev.mysql.com/downloads/connector/net/6.3.html] and mirror sites (note that not all mirror sites may be up to date at this point of time - if you can’t find this version on some mirror, please try again later
or choose another download site.)
The new features or changes in this release are:
Visual Studio 2010 RTM support
New sql editor. Create a new file with a .mysql extension to see it in action
What we know may be broken
Documentation is not updated yet and is not integrated into VS 2010
Please let us know what else we broke and how we can make it better!
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 01 Jun 2010 15:00:08 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:5:{i:0;a:5:{s:4:"data";s:4:".NET";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:10:"Technology";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:8:".NET 4.0";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:2:"C#";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1982:"Sorry for this not getting out sooner. Connector/Net 6.3.2 was released several days ago but somehow the release announcement was never made.
MySQL Connector/Net 6.3.2, a new version of the all-managed .NET driver for MySQL has been released. This is a beta release and is intended to introduce you to the new features and enhancements we are planning. This
release should not be used in a production environment. It is now available in source and binary form from
[http://dev.mysql.com/downloads/connector/net/6.3.html] and mirror sites (note that not all mirror sites may be up to date at this point of time - if you can’t find this version on some mirror, please try again later
or choose another download site.)
The new features or changes in this release are:
- Visual Studio 2010 RTM support
- New sql editor. Create a new file with a .mysql extension to see it in action
What we know may be broken
Documentation is not updated yet and is not integrated into VS 2010
Please let us know what else we broke and how we can make it better!



PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Reggie Burnett";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:5;a:6:{s:4:"data";s:63:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:58:"mk-query-digest Tips – Showing all hosts & users";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:38:"http://ronaldbradford.com/blog/?p=2824";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:87:"http://ronaldbradford.com/blog/mk-query-digest-tips-showing-all-hosts-users-2010-06-01/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1976:"The Maatkit tools provide a suite of additional MySQL commands. There is one command I use constantly and that is mk-query-digest.
Unfortunately the documentation does leave a lot to be desired for usability. While throughout, it is a man page and not a user guide. Several of us have discussed writing better documentation however it’s always a matter of time. I have however learned a number of tips and I’d like to share them in smaller digests.
The first is showing additional display. Maatkit works on truncating per line output to a reasonable length of 73 characters?
One of those lines is the list of hosts that connected to MySQL for a query, for example.
# Hosts 4 192.168.40... (2), 192.168.40... (2)... 2 more
# Hosts 3 99.99.245.14 (12), 999.106.206.167 (6)... 1 more
The problem is I want to know what that 1 more is so I can gather a complete list of IP addresses that connect to this server. You do that with the –show-all=host argument.
Without
$ cat external.tcpdump | ./mk-query-digest --type tcpdump | grep Hosts | uniq -c
#
1 # Hosts 3 99.99.245.14 (12), 999.106.206.167 (6)... 1 more
1 # Hosts 1 99.99.139.140
With
$ cat external.tcpdump | ./mk-query-digest --type tcpdump --show-all=host | grep Hosts | uniq -c
1 # Hosts 3 99.99.245.14 (12), 999.106.206.167 (6), 99.99.139.140 (2)
1 # Hosts 1 99.99.139.140
You can apply the same principle to the Users as well with –show-all=user
$ cat external.tcpdump | ./mk-query-digest --type tcpdump --show-all=user | grep Users | uniq -c
1 # Users 2 xxx (13), phpmysqlmo... (5)
49 # Users 1 xxx
The problem is a still gett a truncation of the name ‘phpmysqlmo…’ That’s the one thing I’m trying to uncover, because that IP and usernme are not valid permissions for this system.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 01 Jun 2010 14:19:52 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:5:{i:0;a:5:{s:4:"data";s:9:"Databases";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:12:"Professional";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:7:"maatkit";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:15:"mk-query-digest";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2432:"The Maatkit tools provide a suite of additional MySQL commands. There is one command I use constantly and that is mk-query-digest.
Unfortunately the documentation does leave a lot to be desired for usability. While throughout, it is a man page and not a user guide. Several of us have discussed writing better documentation however it’s always a matter of time. I have however learned a number of tips and I’d like to share them in smaller digests.
The first is showing additional display. Maatkit works on truncating per line output to a reasonable length of 73 characters?
One of those lines is the list of hosts that connected to MySQL for a query, for example.
# Hosts 4 192.168.40... (2), 192.168.40... (2)... 2 more
# Hosts 3 99.99.245.14 (12), 999.106.206.167 (6)... 1 more
The problem is I want to know what that 1 more is so I can gather a complete list of IP addresses that connect to this server. You do that with the –show-all=host argument.
Without
$ cat external.tcpdump | ./mk-query-digest --type tcpdump | grep Hosts | uniq -c
#
1 # Hosts 3 99.99.245.14 (12), 999.106.206.167 (6)... 1 more
1 # Hosts 1 99.99.139.140
With
$ cat external.tcpdump | ./mk-query-digest --type tcpdump --show-all=host | grep Hosts | uniq -c
1 # Hosts 3 99.99.245.14 (12), 999.106.206.167 (6), 99.99.139.140 (2)
1 # Hosts 1 99.99.139.140
You can apply the same principle to the Users as well with –show-all=user
$ cat external.tcpdump | ./mk-query-digest --type tcpdump --show-all=user | grep Users | uniq -c
1 # Users 2 xxx (13), phpmysqlmo... (5)
49 # Users 1 xxx
The problem is a still gett a truncation of the name ‘phpmysqlmo…’ That’s the one thing I’m trying to uncover, because that IP and usernme are not valid permissions for this system.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:15:"Ronald Bradford";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:6;a:6:{s:4:"data";s:53:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:23:"On Good Instrumentation";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:43:"http://www.mysqlperformanceblog.com/?p=2912";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:71:"http://www.mysqlperformanceblog.com/2010/05/31/on-good-instrumentation/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:6572:"In so many cases troubleshooting applications I keep thinking how much more efficient things could be going if only there would be a good instrumentation available. Most of applications out there have very little code to help understand what is going on and if it is there it is frequently looking at some metrics which are not very helpful.
If you look at the system from bird eye view – system needs to process transactions and you want it to successfully complete large number of transactions it gets (this is what called availability) and we want it to serve them with certain response time, which is what is called performance. There could be many variables in environment which change – load, number of concurrent users, database, the way users use the system but in the nutshell all what you really care is having predictable response time within certain range. So if we care about response time – this is exactly what our instrumentation should measure
Response Time Summary We want to understand where exactly response time comes from. For example if we define transaction as the time it took to generate HTML page we want to understand how much time was spent waiting on the database, memcache, other external services, as well as how much CPU time it consumed.
Now what is important we need this information for individual transactions. It may be every transaction which is best and easily achievable for small-medium systems or at least for large enough sample. It is very important this information is available for individual transactions not the average. Average is useless because 100 transactions taking 1 sec and 99 transactions having 1ms and 1 taking 99.1 sec will have the same average while for sake of performance analyzes these are completely different. When you have transaction sample make sure it contains fair population of transaction – getting only transactions which are slow is not helpful as we might want to compare them to the fast transactions to understand why they are slow.
What kind of components do you need to have in response time summary – all components which are significant enough. If your instrumentation has 95% of response time unaccounted for it is useless. You also want blocks not to mix apples with oranges. For example “mysql and memcache” block would not be helpful. Even further I would prefer to split “mysql time” in the “connect time” and “query time” as there are situations when one but not other would be affected.
In is important for response time summary stored in the logs which are easy to query so you can analyze data in a lot of various ways. Sometimes you may find the response time is impacted by queries from certain user, in others it may be attributed to different application/web server.
The goal for Response Time Summary is to quickly point direction where problem happens. Whenever you have spike in response time or it is bad response time for certain kind of request you can quickly understand where does it come from ? Is it wasted CPU time Slow response from MySQL or Memcache.
I also like to see numbers of calls stored together with attributed response time. For example I’d like to see number of mysql calls in addition to MySQL response time. This helps to understand if it is the issue with number of queries or their performance. If I see 2 queries taking 30 seconds it is clearly slow queries. If it is 10.000 queries executed and total response time is 4 sec I know it is pretty much as good as it gets with standard Ethernet network and finding a ways to reduce number of queries is going to be more helpful.
The Glue Our applications involve multiple layers and typically higher layer can only report response time it took to call lower layer, but not the reason for that response time. For example we can report time it took to execute SQL query from PHP application side, but we can’t say why it has taken so long. Was it row level lock ? waiting on disk IO or was it simply question of burning a lot of CPU. On the other hand this information may be available in the instrumentation stats from that lower layer – for example in MySQL Query Log. What is important is however to be able to connect the data from these logs – glue them together. The easy way to do it is to provide an unique identifier to all requests and put it in the logs with request of the lower levels. With mySQL the simple way to do it is to put it in the comments for queries you execute.
Optional Tracing The information in lower layers logs is very helpful however it typically have two problems. First not every layer has good logs. For example if you’re running memcached you probably do not have the logs detailing all requests and their response time. Second – the lower layer may only know response time from its vantage point, which in many cases does not include network communication time which can be very important.
Tracing should be optional and normally applied to the small sample of requests, though it needs to be detailed. Typically you would include the calls to the lower level services together with timestamp, the response summary with timestamp again. The information about request has to be complete enough to identify target action and response completely. For example if I’m speaking about memcache I’d like to know which server:port request was issued to which key was requested, and on response I’d like to know if it was hit, miss or error.
The way I use it may be as follows. I see the increased response time for given kind of request. I see response time is coming from MySQL. I check the number of MySQL Queries and it is 5x when it usually is for this kind of request. Looking at memcache stats I can see high number of misses. Looking at some available traces shows the server memcache01 has very high miss ratio. Checking what is going on with memcache01 shows it just was restarted (and hence has almost empty cache). This is important example as it shows your increased response time from MySQL may not have anything to do with MySQL itself but you would not know unless you’re capturing the right data.
If you’re looking for nice example framework for instrumentation, check out instrumentation for PHP – It has everything mentioned by tracing which is trivial to add.
Entry posted by peter |
One comment
Add to: | | | | ";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 21:25:44 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:9:"memcached";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:10:"production";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:8707:"In so many cases troubleshooting applications I keep thinking how much more efficient things could be going if only there would be a good instrumentation available. Most of applications out there have very little code to help understand what is going on and if it is there it is frequently looking at some metrics which are not very helpful.
If you look at the system from bird eye view – system needs to process transactions and you want it to successfully complete large number of transactions it gets (this is what called availability) and we want it to serve them with certain response time, which is what is called performance. There could be many variables in environment which change – load, number of concurrent users, database, the way users use the system but in the nutshell all what you really care is having predictable response time within certain range. So if we care about response time – this is exactly what our instrumentation should measure
Response Time Summary We want to understand where exactly response time comes from. For example if we define transaction as the time it took to generate HTML page we want to understand how much time was spent waiting on the database, memcache, other external services, as well as how much CPU time it consumed.
Now what is important we need this information for individual transactions. It may be every transaction which is best and easily achievable for small-medium systems or at least for large enough sample. It is very important this information is available for individual transactions not the average. Average is useless because 100 transactions taking 1 sec and 99 transactions having 1ms and 1 taking 99.1 sec will have the same average while for sake of performance analyzes these are completely different. When you have transaction sample make sure it contains fair population of transaction – getting only transactions which are slow is not helpful as we might want to compare them to the fast transactions to understand why they are slow.
What kind of components do you need to have in response time summary – all components which are significant enough. If your instrumentation has 95% of response time unaccounted for it is useless. You also want blocks not to mix apples with oranges. For example “mysql and memcache” block would not be helpful. Even further I would prefer to split “mysql time” in the “connect time” and “query time” as there are situations when one but not other would be affected.
In is important for response time summary stored in the logs which are easy to query so you can analyze data in a lot of various ways. Sometimes you may find the response time is impacted by queries from certain user, in others it may be attributed to different application/web server.
The goal for Response Time Summary is to quickly point direction where problem happens. Whenever you have spike in response time or it is bad response time for certain kind of request you can quickly understand where does it come from ? Is it wasted CPU time Slow response from MySQL or Memcache.
I also like to see numbers of calls stored together with attributed response time. For example I’d like to see number of mysql calls in addition to MySQL response time. This helps to understand if it is the issue with number of queries or their performance. If I see 2 queries taking 30 seconds it is clearly slow queries. If it is 10.000 queries executed and total response time is 4 sec I know it is pretty much as good as it gets with standard Ethernet network and finding a ways to reduce number of queries is going to be more helpful.
The Glue Our applications involve multiple layers and typically higher layer can only report response time it took to call lower layer, but not the reason for that response time. For example we can report time it took to execute SQL query from PHP application side, but we can’t say why it has taken so long. Was it row level lock ? waiting on disk IO or was it simply question of burning a lot of CPU. On the other hand this information may be available in the instrumentation stats from that lower layer – for example in MySQL Query Log. What is important is however to be able to connect the data from these logs – glue them together. The easy way to do it is to provide an unique identifier to all requests and put it in the logs with request of the lower levels. With mySQL the simple way to do it is to put it in the comments for queries you execute.
Optional Tracing The information in lower layers logs is very helpful however it typically have two problems. First not every layer has good logs. For example if you’re running memcached you probably do not have the logs detailing all requests and their response time. Second – the lower layer may only know response time from its vantage point, which in many cases does not include network communication time which can be very important.
Tracing should be optional and normally applied to the small sample of requests, though it needs to be detailed. Typically you would include the calls to the lower level services together with timestamp, the response summary with timestamp again. The information about request has to be complete enough to identify target action and response completely. For example if I’m speaking about memcache I’d like to know which server:port request was issued to which key was requested, and on response I’d like to know if it was hit, miss or error.
The way I use it may be as follows. I see the increased response time for given kind of request. I see response time is coming from MySQL. I check the number of MySQL Queries and it is 5x when it usually is for this kind of request. Looking at memcache stats I can see high number of misses. Looking at some available traces shows the server memcache01 has very high miss ratio. Checking what is going on with memcache01 shows it just was restarted (and hence has almost empty cache). This is important example as it shows your increased response time from MySQL may not have anything to do with MySQL itself but you would not know unless you’re capturing the right data.
If you’re looking for nice example framework for instrumentation, check out instrumentation for PHP – It has everything mentioned by tracing which is trivial to add.
Entry posted by peter |
One comment
Add to:
|
|
|
| 
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:22:"MySQL Performance Blog";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:7;a:6:{s:4:"data";s:48:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:2:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:71:"How To Set Up WebDAV With MySQL Authentication On Apache2 (Ubuntu 9.10)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:95:"http://www.howtoforge.com/how-to-set-up-webdav-with-mysql-authentication-on-apache2-ubuntu-9.10";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:95:"http://www.howtoforge.com/how-to-set-up-webdav-with-mysql-authentication-on-apache2-ubuntu-9.10";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:509:"
How To Set Up WebDAV With MySQL Authentication On Apache2 (Ubuntu
9.10)
This guide explains how to set up WebDAV with MySQL authentication
(using mod_auth_mysql) on Apache2 on an Ubuntu 9.10 server. WebDAV
stands for Web-based Distributed Authoring and Versioning and
is a set of extensions to the HTTP protocol that allow users to directly
edit files on the Apache server so that they do not need to be
downloaded/uploaded via FTP. Of course, WebDAV can also be used to
upload and download files.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 15:11:35 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:6:"Ubuntu";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:6:"Apache";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:988:"
How To Set Up WebDAV With MySQL Authentication On Apache2 (Ubuntu
9.10)
This guide explains how to set up WebDAV with MySQL authentication
(using mod_auth_mysql) on Apache2 on an Ubuntu 9.10 server. WebDAV
stands for Web-based Distributed Authoring and Versioning and
is a set of extensions to the HTTP protocol that allow users to directly
edit files on the Apache server so that they do not need to be
downloaded/uploaded via FTP. Of course, WebDAV can also be used to
upload and download files.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:8;a:6:{s:4:"data";s:63:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:39:"Fixing MySQL group commit (part 4 of 3)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:49:"http://kristiannielsen.livejournal.com/12810.html";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:49:"http://kristiannielsen.livejournal.com/12810.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:8900:"
(No
three-part series is complete without a part 4, right?)
Here is an analogy that describes well what group commit does. We have a bus
driving back and forth transporting people from A to B (corresponding
to fsync() "transporting" commits to durable storage on
disk). The group commit optimisation is to have the bus pick up everyone that
is waiting at A before driving to B, not drive people one by one. Makes sense,
huh? :-)
It is pretty obvious that this optimisation of having more than one person in
the bus can dramatically improve throughput, and it is the same for the group
commit optimisation. Here is a graph from a benchmark comparing stock MariaDB
5.1 vs. MariaDB patched with a proof-of-concept patch that enables group
commit:
When group commit is implemented, we see clearly how performance (measured in
queries per second) scales dramatically as the number of threads
increases. Whereas with stock MariaDB with no group commit, there is no
scaling at all. We also see that SSD is better than HDD (no surprise there),
but that with sufficient parallelism from the application, group commit can to
a large extent compensate for the slower disks.
This is the same benchmark as in
the first part
of the series. Binlog is enabled. Durability is enabled
with sync_binlog=1 and flush_log_at_trx_commit=1
(and disk cache disabled to prevent the disks lying about when data is
durable). The load is single-row transactions against a 1000000-row XtraDB
table. The benchmark is thus specifically designed to make
the fsync() calls at the end of commit the bottleneck.
I should remark that I did not really tune the servers used in the benchmark
for high parallelism (except for raising max_connections :-), and
I ran the client on the same machine as the server. So it is likely that there
are other effects than group commit influencing the performance at high
parallelism (especially on the SSD results, which I ran on my laptop). But I
just wanted to see if my group commit work scales with higher parallelism, and the
graphs clearly shows that it does!
Architecture
For this work, I have focused a lot on the API for storage engine and binlog
plugins (we do not have binlog plugins now, but this is something that we will
be working on in MariaDB later this year). I want a clean interface that
allows plugins to implement group commit in a simple and efficient manner.
A crucial point is the desire to get commits ordered the same way in the
different engines (ie. typically in InnoDB and in the binlog), as I discussed
in previous articles. As group commit is about parallelims, and ordering is
about serialisation, these two tend to get into conflict. My idea is to
introduce new calls in the interface to storage engines and the XA transaction
coordinator (which is how binlog interacts with commit internally in the
server). These new calls allow plugins that care about commit order to
cooperate on getting correct ordering without getting in each others way and
killing parallelims. Plugins that do not need any ordering can ignore the
new calls, which are optional (for example the transaction coordinator that
runs when the binlog is disabled does not need any ordering).
The full architecture is written up in detail in the
MariaDB Worklog#116.
But the basic idea is to introduce a new handlerton method:
void commit_ordered(handlerton *hton, THD *thd, bool all);
This is called just prior to the normal commit() method, and is
guaranteed to run in the same commit order across all engines (and binlog)
participating in the transaction.
This allows for a lot of flexibility in plugin implementations. A typical
implementation would in the commit_ordered() method write the
transaction data into its in-memory log buffer, and delay the
time-consuming write() and fsync() to the
parallel commit() method. InnoDB/XtraDB is already structured in
this way, so fits very well into this scheme.
But if an engine wants to use another approach, for example a
ticket-based approach
as Mark
and Mats
suggested, that is easy to do too. Just allocate the ticket
in commit_ordered(), and use it in commit(). I
believe most approaches should fit in well with the proposed model.
I also added a corresponding prepare_ordered() call, which runs
in commit order during the prepare phase. The intension is to provide a place
to release InnoDB row locks early for even better performance, though I still
need to get the Facebook people to explain exactly what they want to do in
this respect ;-)
I also spent a lot of thought on getting efficient inter-thread
synchronisation in the
archtecture. As Mats
mentioned, if one is not careful, it is easy to end up
with O(N2) cost of thread wake-up, with N
the number of transactions participating in group commit. As the goal is to get N
as high as possible to maximise sharing of the expensive fsync()
call, such O(N2) cost is to be avoided.
In the architecture
described in MariaDB
Worklog#116, there should in the normal case only be a single highly
contested lock, the one on the binlog group commit (which is inherent to the
idea of group commit, one thread does the fsync() while the rest
of participating threads wait). I use a lock-free queue to make threads
in prepare_ordered() not block threads
in commit_ordered() and vice
versa. The prepare_ordered() calls runs under a global lock, but
as they are intended to execute very quickly there should ideally be little contention
here. The commit_ordered() calls run in a loop in a single
thread, also avoiding serious lock contention as long as commit_ordered() runs
quickly as intended.
In particular, running the commit_ordered() loop in a single
thread for each group commit avoids high cost of thread wake-up. If we were to
try to run the sequential part of commit in different threads in a specific
commit order, we would need to switch execution context from one thread to the
next, bouncing the thread of control all over the cores in an SMP
system. Which takes lots of context switches, and could potentially be
costly. In the proposed architecture, a single thread runs
all commit_ordered() method calls and wakes up the other waiting
threads individually, each free to proceed immediately without any more
waiting for one another.
Of course, an engine/binlog plugin that so desires is free to implement such
thread-hopping itself, by allocating a ticket in one of
the _ordered() methods, and doing its own synchronisation in
its commit() method. After all, it may be beneficial or necessary in
some cases. The point is that different plugins can use different methods,
each using the one that works best for that particular engine without getting
in the way of each other.
Further improvements
If we implement this approach, there are a couple of other interesting
enhancements that can be implemented relatively easy due to the commit
ordering facilities:
Currently, we sync to disk three times per commit to ensure consistency
between InnoDB and binlog after a crash. But if we know the commit order is
the same in engine and in binlog, and if we store in the engine the
corresponding binlog position (which InnoDB already does), then we need only
sync once (for the binlog) and can still recover reliably after a
crash. Since we have a consistent commit order, we can during crash recovery replay
the binlog from the position after the last not lost commit inside InnoDB
(just like we would apply the binlog on a slave).
Currently, the START TRANSACTION WITH CONSISTENT SNAPSHOT,
which is supposed to run a transaction with a consistent view in multiple
transactional engines, is not really all that consistent. It is quite
possible to see a transaction committed in one engine but not in another,
and vice versa. However, with an architecture like the one proposed here, it
should be easy to just take the snapshot under the same lock
that commit_ordered() runs under, and the snapshot will be
really consistent (on engines that support commit order). As a bonus, it
would also be possible to provice a binlog position corresponding to the
consistent snapshot.
XtraDB (and similar backup solutions) should be able to create a backup
which includes a binlog position (suitable for provisioning a new slave)
without having to run FLUSH TABLES WITH READ LOCK, which can be
quite costly as it blocks all transaction processing while it runs.
As already mentioned, the Facebook group has some ideas for releasing InnoDB
row locks early in order to reduce the load on hot-spot rows; this requires
consistent commit order.
Implementation
If anyone is interested in looking at the actual code of the proof-of-concept
implementation, it is available as
a quilt patch
series and as
a Launchpad
bzr tree (licence is GPLv2).
Do be aware that this is work in progress.
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 14:17:25 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:5:{i:0;a:5:{s:4:"data";s:12:"freesoftware";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:7:"mariadb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:11:"performance";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:11:"programming";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:10590:"
(No
three-part series is complete without a part 4, right?)
Here is an analogy that describes well what group commit does. We have a bus
driving back and forth transporting people from A to B (corresponding
to fsync() "transporting" commits to durable storage on
disk). The group commit optimisation is to have the bus pick up everyone that
is waiting at A before driving to B, not drive people one by one. Makes sense,
huh? :-)
It is pretty obvious that this optimisation of having more than one person in
the bus can dramatically improve throughput, and it is the same for the group
commit optimisation. Here is a graph from a benchmark comparing stock MariaDB
5.1 vs. MariaDB patched with a proof-of-concept patch that enables group
commit:
When group commit is implemented, we see clearly how performance (measured in
queries per second) scales dramatically as the number of threads
increases. Whereas with stock MariaDB with no group commit, there is no
scaling at all. We also see that SSD is better than HDD (no surprise there),
but that with sufficient parallelism from the application, group commit can to
a large extent compensate for the slower disks.
This is the same benchmark as in
the first part
of the series. Binlog is enabled. Durability is enabled
with sync_binlog=1 and flush_log_at_trx_commit=1
(and disk cache disabled to prevent the disks lying about when data is
durable). The load is single-row transactions against a 1000000-row XtraDB
table. The benchmark is thus specifically designed to make
the fsync() calls at the end of commit the bottleneck.
I should remark that I did not really tune the servers used in the benchmark
for high parallelism (except for raising max_connections :-), and
I ran the client on the same machine as the server. So it is likely that there
are other effects than group commit influencing the performance at high
parallelism (especially on the SSD results, which I ran on my laptop). But I
just wanted to see if my group commit work scales with higher parallelism, and the
graphs clearly shows that it does!
Architecture
For this work, I have focused a lot on the API for storage engine and binlog
plugins (we do not have binlog plugins now, but this is something that we will
be working on in MariaDB later this year). I want a clean interface that
allows plugins to implement group commit in a simple and efficient manner.
A crucial point is the desire to get commits ordered the same way in the
different engines (ie. typically in InnoDB and in the binlog), as I discussed
in previous articles. As group commit is about parallelims, and ordering is
about serialisation, these two tend to get into conflict. My idea is to
introduce new calls in the interface to storage engines and the XA transaction
coordinator (which is how binlog interacts with commit internally in the
server). These new calls allow plugins that care about commit order to
cooperate on getting correct ordering without getting in each others way and
killing parallelims. Plugins that do not need any ordering can ignore the
new calls, which are optional (for example the transaction coordinator that
runs when the binlog is disabled does not need any ordering).
The full architecture is written up in detail in the
MariaDB Worklog#116.
But the basic idea is to introduce a new handlerton method:
void commit_ordered(handlerton *hton, THD *thd, bool all);
This is called just prior to the normal commit() method, and is
guaranteed to run in the same commit order across all engines (and binlog)
participating in the transaction.
This allows for a lot of flexibility in plugin implementations. A typical
implementation would in the commit_ordered() method write the
transaction data into its in-memory log buffer, and delay the
time-consuming write() and fsync() to the
parallel commit() method. InnoDB/XtraDB is already structured in
this way, so fits very well into this scheme.
But if an engine wants to use another approach, for example a
ticket-based approach
as Mark
and Mats
suggested, that is easy to do too. Just allocate the ticket
in commit_ordered(), and use it in commit(). I
believe most approaches should fit in well with the proposed model.
I also added a corresponding prepare_ordered() call, which runs
in commit order during the prepare phase. The intension is to provide a place
to release InnoDB row locks early for even better performance, though I still
need to get the Facebook people to explain exactly what they want to do in
this respect ;-)
I also spent a lot of thought on getting efficient inter-thread
synchronisation in the
archtecture. As Mats
mentioned, if one is not careful, it is easy to end up
with O(N2) cost of thread wake-up, with N
the number of transactions participating in group commit. As the goal is to get N
as high as possible to maximise sharing of the expensive fsync()
call, such O(N2) cost is to be avoided.
In the architecture
described in MariaDB
Worklog#116, there should in the normal case only be a single highly
contested lock, the one on the binlog group commit (which is inherent to the
idea of group commit, one thread does the fsync() while the rest
of participating threads wait). I use a lock-free queue to make threads
in prepare_ordered() not block threads
in commit_ordered() and vice
versa. The prepare_ordered() calls runs under a global lock, but
as they are intended to execute very quickly there should ideally be little contention
here. The commit_ordered() calls run in a loop in a single
thread, also avoiding serious lock contention as long as commit_ordered() runs
quickly as intended.
In particular, running the commit_ordered() loop in a single
thread for each group commit avoids high cost of thread wake-up. If we were to
try to run the sequential part of commit in different threads in a specific
commit order, we would need to switch execution context from one thread to the
next, bouncing the thread of control all over the cores in an SMP
system. Which takes lots of context switches, and could potentially be
costly. In the proposed architecture, a single thread runs
all commit_ordered() method calls and wakes up the other waiting
threads individually, each free to proceed immediately without any more
waiting for one another.
Of course, an engine/binlog plugin that so desires is free to implement such
thread-hopping itself, by allocating a ticket in one of
the _ordered() methods, and doing its own synchronisation in
its commit() method. After all, it may be beneficial or necessary in
some cases. The point is that different plugins can use different methods,
each using the one that works best for that particular engine without getting
in the way of each other.
Further improvements
If we implement this approach, there are a couple of other interesting
enhancements that can be implemented relatively easy due to the commit
ordering facilities:
- Currently, we sync to disk three times per commit to ensure consistency
between InnoDB and binlog after a crash. But if we know the commit order is
the same in engine and in binlog, and if we store in the engine the
corresponding binlog position (which InnoDB already does), then we need only
sync once (for the binlog) and can still recover reliably after a
crash. Since we have a consistent commit order, we can during crash recovery replay
the binlog from the position after the last not lost commit inside InnoDB
(just like we would apply the binlog on a slave).
- Currently, the
START TRANSACTION WITH CONSISTENT SNAPSHOT,
which is supposed to run a transaction with a consistent view in multiple
transactional engines, is not really all that consistent. It is quite
possible to see a transaction committed in one engine but not in another,
and vice versa. However, with an architecture like the one proposed here, it
should be easy to just take the snapshot under the same lock
that commit_ordered() runs under, and the snapshot will be
really consistent (on engines that support commit order). As a bonus, it
would also be possible to provice a binlog position corresponding to the
consistent snapshot.
- XtraDB (and similar backup solutions) should be able to create a backup
which includes a binlog position (suitable for provisioning a new slave)
without having to run
FLUSH TABLES WITH READ LOCK, which can be
quite costly as it blocks all transaction processing while it runs.
-
As already mentioned, the Facebook group has some ideas for releasing InnoDB
row locks early in order to reduce the load on hot-spot rows; this requires
consistent commit order.
Implementation
If anyone is interested in looking at the actual code of the proof-of-concept
implementation, it is available as
a quilt patch
series and as
a Launchpad
bzr tree (licence is GPLv2).
Do be aware that this is work in progress.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:16:"Kristian Nielsen";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:9;a:6:{s:4:"data";s:38:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:5:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:22:"MySQL in openSUSE 11.3";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:74:"http://michal.hrusecky.net/index.php/blog/show/MySQL-in-openSUSE-11.3.html";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:74:"http://michal.hrusecky.net/index.php/blog/show/MySQL-in-openSUSE-11.3.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2978:"Original title was 'MySQL dropped from openSUSE!!!'. I wanted to have some shocking title, but I changed it as I don't really want to scare you so much But it is partially true, there is no mysql package in openSUSE anymore. But of course we DID NOT really dropped MySQL. In fact, we now have more MySQL in openSUSE then we ever had! Do I got your attention? Read on
What and why?
What really happened is that I renamed original MySQL package. Now it is called mysql-community-server. If you take a look at SUN/Oracle web, they call it like this for a long time, so it makes a little sense... As a result, there is no real package called mysql in openSUSE anymore. But mysql-community-server provides mysql so even if you try to install mysql, it will work. This change also have one funny consequence. Do you remember package mysql-client? Now it's called mysql-community-server-client
Why such a change? I haven't done something like this just because such a little difference in the naming of packages. No, I had some other plans as well. Reason to rename original MySQL package was to provide some space for other possible mysql providers. And we have them in 11.3! These other providers are MariaDB and MySQL Cluster. So now you can choose which MySQL do you want And as all of them provides the same symbol, namely mysql, you can use whichever you want to satisfy dependencies. But the default one is still mysql-community-server. So if you don't need anything special, nearly nothing changed for you.
Alternatives
So let's take a brief look at what MySQL Community Server alternatives do we now have in openSUSE...
MariaDB
MariaDB is a backward compatible, drop-in replacement branch of the MySQL Database Server. That means that differences in the user interface are minimal. You probably wouldn't even notice on the first sight if they get replaced. So what are the differences? The best way is to read official page about differences:
http://askmonty.org/wiki/MariaDB_versus_MySQL
Basically MariaDB incorporates many community provided patches out there and some more bleeding edge storage engines. If you want to try it, you can simply type
zypper in mariadb
And zypper will take care of conflicts and replace your MySQL with Maria. If you want to get back to MySQL, it's similarly simple, just use
zypper in mysql-community-server
MySQL Cluster
Other MySQL alternative is MySQL Cluster. This should provide real clustering support for MySQL. And it is also distributed and maintained by Oracle. If you are interested in HA, you have many servers, you might be interested in this. For more information check official web (we included version 7.0 in 11.3):
http://www.mysql.com/products/database/cluster/
If you want to try it, you can simply type
zypper in mysql-cluster
And zypper will take care of conflicts and replace your ordinary MySQL with MySQL Cluster. If you want to get back to MySQL, it's similarly simple, just use
zypper in mysql-community-server";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 13:44:02 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:4626:"Original title was 'MySQL dropped from openSUSE!!!'. I wanted to have some shocking title, but I changed it as I don't really want to scare you so much
But it is partially true, there is no mysql package in openSUSE anymore. But of course we DID NOT really dropped MySQL. In fact, we now have more MySQL in openSUSE then we ever had! Do I got your attention? Read on 
What and why?
What really happened is that I renamed original MySQL package. Now it is called mysql-community-server. If you take a look at SUN/Oracle web, they call it like this for a long time, so it makes a little sense... As a result, there is no real package called mysql in openSUSE anymore. But mysql-community-server provides mysql so even if you try to install mysql, it will work. This change also have one funny consequence. Do you remember package mysql-client? Now it's called mysql-community-server-client 
Why such a change? I haven't done something like this just because such a little difference in the naming of packages. No, I had some other plans as well. Reason to rename original MySQL package was to provide some space for other possible mysql providers. And we have them in 11.3! These other providers are MariaDB and MySQL Cluster. So now you can choose which MySQL do you want
And as all of them provides the same symbol, namely mysql, you can use whichever you want to satisfy dependencies. But the default one is still mysql-community-server. So if you don't need anything special, nearly nothing changed for you.
Alternatives
So let's take a brief look at what MySQL Community Server alternatives do we now have in openSUSE...
MariaDB
MariaDB is a backward compatible, drop-in replacement branch of the MySQL Database Server. That means that differences in the user interface are minimal. You probably wouldn't even notice on the first sight if they get replaced. So what are the differences? The best way is to read official page about differences:
http://askmonty.org/wiki/MariaDB_versus_MySQL
Basically MariaDB incorporates many community provided patches out there and some more bleeding edge storage engines. If you want to try it, you can simply type
zypper in mariadb
And zypper will take care of conflicts and replace your MySQL with Maria. If you want to get back to MySQL, it's similarly simple, just use
zypper in mysql-community-server
MySQL Cluster
Other MySQL alternative is MySQL Cluster. This should provide real clustering support for MySQL. And it is also distributed and maintained by Oracle. If you are interested in HA, you have many servers, you might be interested in this. For more information check official web (we included version 7.0 in 11.3):
http://www.mysql.com/products/database/cluster/
If you want to try it, you can simply type
zypper in mysql-cluster
And zypper will take care of conflicts and replace your ordinary MySQL with MySQL Cluster. If you want to get back to MySQL, it's similarly simple, just use
zypper in mysql-community-server
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:17:"Michal Hrušecký";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:10;a:6:{s:4:"data";s:88:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:44:"OpenSQLCamp EU 2010 - Call for participation";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:59:"tag:blogger.com,1999:blog-16959946.post-6567168214584908003";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:84:"http://datacharmer.blogspot.com/2010/05/opensqlcamp-2010-call-for-participation.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1228:"The European OpenSQLCamp 2010 will take place in parallel to the Free and Open Source Conference 2010 (FrOSCon) on Saturday 21st and Sunday 22nd August at the Fachhochschule Bonn-Rhein-Sieg in St. Augustin, Germany. St. Augustin is located close to Bonn and Cologne.The Call for Participation is now online.The event is organized by yours truly and Felix Schupp, and we are open to cooperation from other volunteers.Specifically, we need help to beat the drum. Even if you can't participate, we will appreciate your help in making the Call for Participation known. OpenSQLCamp2010 will use the FrOSCon's Pentabarf conference coordination system to collect talk submissions and perform the organizing and scheduling of the talks.Please create an account there, if you don't have one already. Once you have activated your account via the email address you provided, please log into the system and create a new event. Make sure to select track OpenSQLCamp for your submission!IMPORTANT! - FrOSCon uses CA certificates. If you browser does not recognize them, then you need to Import the CAcert Root Certificate before using the CfP pages. The deadline for submitting your proposal is Sunday, July 11th, 2010 (12:00pm PST).";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 07:50:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:10:{i:0;a:5:{s:4:"data";s:11:"open source";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:10:"conference";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"europe";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:3:"cfp";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:4:"2010";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:11:"opensqlcamp";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:9:"community";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:8;a:5:{s:4:"data";s:7:"talking";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:9;a:5:{s:4:"data";s:3:"SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2705:"
The event is organized by yours truly and Felix Schupp, and we are open to cooperation from other volunteers.
Specifically, we need help to beat the drum. Even if you can't participate, we will appreciate your help in making the Call for Participation known.
OpenSQLCamp2010 will use the FrOSCon's Pentabarf conference coordination system to collect talk submissions and perform the organizing and scheduling of the talks.
Please create an account there, if you don't have one already. Once you have activated your account via the email address you provided, please log into the system and create a new event. Make sure to select track OpenSQLCamp for your submission!
IMPORTANT! - FrOSCon uses CA certificates. If you browser does not recognize them, then you need to Import the CAcert Root Certificate before using the CfP pages.
The deadline for submitting your proposal is Sunday, July 11th, 2010 (12:00pm PST).
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Giuseppe Maxia";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:11;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:47:"Wearing a Red Tie (or a T-Shirt if you prefer).";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-8877901999053801110.post-3201109296942124855";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:110:"http://feedproxy.google.com/~r/ItsJustAboutCommunication/~3/2NTtVdFsMTA/wearing-red-tie-or-t-shirt-if-you.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:847:"Starting from tomorrow (1st of July) I will officially become an Oracle employee. It is quite funny how many companies you can change without even applying for another job. ;-)
I've been hired by MySQL in the beginning of 2008, a few months later it has been acquired by Sun and after about a year we've been acquired again, this time by Oracle.
I personally consider this a new beginning and I join Oracle with a lot of enthusiasm. After all I'm now part of a company that has a huge set of products and technologies and it is like a playground for me.
I'm afraid I've not blogged frequently in these months, but I plan to write more and more in the future. This short post is just to wish good luck to all the Dolphins who have joined Oracle and all those who are swimming in a different ocean.
Go MySQL! Go Oracle!
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 07:11:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:12:"luca olivari";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"myself";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"oracle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:3848:"Starting from tomorrow (1st of July) I will officially become an Oracle employee. It is quite funny how many companies you can change without even applying for another job. ;-)
I've been hired by MySQL in the beginning of 2008, a few months later it has been acquired by Sun and after about a year we've been acquired again, this time by Oracle.
I personally consider this a new beginning and I join Oracle with a lot of enthusiasm. After all I'm now part of a company that has a huge set of products and technologies and it is like a playground for me.
I'm afraid I've not blogged frequently in these months, but I plan to write more and more in the future. This short post is just to wish good luck to all the Dolphins who have joined Oracle and all those who are swimming in a different ocean.
Go MySQL! Go Oracle!



PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:12:"Luca Olivari";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:12;a:6:{s:4:"data";s:68:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:29:"Fast paging in the real world";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://openquery.com/blog/?p=1271";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:48:"http://openquery.com/blog/fast-paging-real-world";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:6495:"This blag was originally posted at http://cafuego.net/2010/05/26/fast-paging-real-world
Some time ago I attended the “Optimisation by Design” course from Open Query¹. In it, Arjen teaches how writing better queries and schemas can make your database access much faster (and more reliable). One such way of optimising things is by adding appropriate query hints or flags. These hints are magic strings that control how a server executes a query or how it returns results.
An example of such a hint is SQL_CALC_FOUND_ROWS. You use it in a select query with a LIMIT clause. It instructs the server to select a limited numbers of rows, but also to calculate the total number of rows that would have been returned without the limit clause in place. That total number of rows is stored in a session variable, which can be retrieved via SELECT FOUND_ROWS(); That simply reads the variable and clears it on the server, it doesn’t actually have to look at any table or index data, so it’s very fast.
This is useful when queries are used to generate pages of data where a user can click a specific page number or click previous/next page. In this case you need the total number of rows to determine how many pages you need to generate links for.
The traditional way is to first run a SELECT COUNT(*) query and then select the rows you want, with LIMIT. If you don’t use a WHERE clause in your query, this can be pretty fast on MyISAM, as it has a magic variable that contains the number of rows in a table. On InnoDB however, which is my storage engine of choice, there is no such variable and consequently it’s not pretty fast.
Paging Drupal
At DrupalConSF earlier this year I’d floated the idea of making Drupal 7 use SQL_CALC_FOUND_ROWS in its pager queries. These are queries generated specifically to display paginated lists of content and the API to do this is pretty straightforward. To do it I needed to add query hint support to the MySQL driver. When it turned out that PostgreSQL and Oracle also support query hints though, the aim became adding hint support for all database drivers.
That’s now done, though only the patch only implements hints on the pager under MySQL at the moment.
One issue keeps cropping up though, a blog by Alexey Kovyrin in 2007 that states SELECT COUNT(*) is faster than using SQL_CALC_FOUND_ROWS. It’s all very well to not have a patch accepted if that statement is correct, but in my experience that is in fact not the case. In my experience the stats are in fact the other way around, SQL_CALC_FOUND_ROWS is nearly always faster than SELECT COUNT(*).
To back up my claims I thought I should run some benchmarks.
I picked the Drupal pager query that lists content (nodes) on the content administration page. It selects node IDs from the node table with a WHERE clause which filters by the content language. Or, in plain SQL, what currently happens is:
SELECT COUNT(*) FROM node WHERE language = 'und';
SELECT nid FROM node WHERE language = 'und' LIMIT 0,50;
and what I’d like to happen is:
SELECT SQL_CALC_FOUND_ROWS nid FROM node WHERE language = 'und' LIMIT 0,50;
SELECT FOUND_ROWS();
Methodology
I ran two sets of tests. One on a node table with 5,000 rows and one with 200,000 rows. For each of these table sizes I ran a pager with 10, 20, 50, 100 and 200 loops, each time increasing the offset by 50; effectively paging through the table. I ran all these using both MyISAM and InnoDB as the storage engine for the node table and I ran them on two machines. One was my desktop, a dual core Athlon X2 5600 with 4Gb of RAM and the other is a single core Xen virtual machine with 512Mb of RAM.
I was hoping to also run tests with 10,000,000 rows, but the virtual machine did not complete any of the queries. So I ran these on my desktop machine only. Again for 10, 20, 50, 100 and 200 queries per run. First with an offset of 50, then with an offset of 10,000. I restarted the MySQL server between each run. To discount query cache advantages, I ran all tests with the query cache disabled. The script I used is attached at the bottom of this post. The calculated times do include the latency of client/server communication, though all tests ran via the local socket connection.
My desktop runs an OurDelta mysql .5.0.87 (the -d10-ourdelta-sail66) to be exact. The virtual machine runs 5.0.87 (-d10-ourdelta65). Before you complain that not running a vanilla MySQL invalidates the results, I run these because I am able to tweak InnoDB a bit more, so the I/O write load on the virtual machine is somewhat reduced compared to the vanilla MySQL.
Results
The graphs show that using SQL_CALC_FOUND_ROWS is virtually always faster than running two queries that each need to look at actual data. Even when using MyISAM. As the database gets bigger, the speed advantage of SQL_CALC_FOUND_ROWS increases. At the 10,000,000 row mark, it’s consistently about twice as fast.
Also interesting is that InnoDB seems significantly slower than MyISAM on the shorter runs. I say seems, because (especially with the 10,000,000 row table) the delay is caused by InnoDB first loading the table from disk into its buffer pool. In the spreadsheet you can see the first query takes up to 40 seconds, whilst subsequent ones are much faster. The MyISAM data is still in the OS file cache, so it doesn’t have that delay on the first query. Because I use innodb_flush_method=O_DIRECT, the InnoDB data is not kept in the OS file cache.
Conclusion
So, it’s official. COUNT(*) is dead, long live SQL_CALC_FOUND_ROWS! :-)
I’ve attached my raw results as a Gnumeric document, so feel free to peruse them. The test script I’ve used is also attached, so you can re-run the benchmark on your own systems if you wish.
Conclusion Addendum
As pointed out in the Drupal pager issue that caused me to run these tests, the query I’m benchmarking uses the language column, which is not indexed and the test also doesn’t allow the server to cache the COUNT(*) query. I’ve rerun the tests with 10 million rows after adding an index and I no longer get a signification speed difference between the two ways of getting the total number of rows.
So I suppose that at least SQL_CALC_FOUND_ROWS will cause your non-indexed pager queries to suck a lot less than they might otherwise and it won’t hurt if they are properly indexed
¹ I now work for Open Query as a consultant.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Mon, 31 May 2010 03:36:55 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:6:{i:0;a:5:{s:4:"data";s:13:"Uncategorized";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"COUNT";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"drupal";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"InnoDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:19:"SQL_CALC_FOUND_ROWS";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:7846:"This blag was originally posted at http://cafuego.net/2010/05/26/fast-paging-real-world
Some time ago I attended the “Optimisation by Design” course from Open Query¹. In it, Arjen teaches how writing better queries and schemas can make your database access much faster (and more reliable). One such way of optimising things is by adding appropriate query hints or flags. These hints are magic strings that control how a server executes a query or how it returns results.
An example of such a hint is SQL_CALC_FOUND_ROWS. You use it in a select query with a LIMIT clause. It instructs the server to select a limited numbers of rows, but also to calculate the total number of rows that would have been returned without the limit clause in place. That total number of rows is stored in a session variable, which can be retrieved via SELECT FOUND_ROWS(); That simply reads the variable and clears it on the server, it doesn’t actually have to look at any table or index data, so it’s very fast.
This is useful when queries are used to generate pages of data where a user can click a specific page number or click previous/next page. In this case you need the total number of rows to determine how many pages you need to generate links for.
The traditional way is to first run a SELECT COUNT(*) query and then select the rows you want, with LIMIT. If you don’t use a WHERE clause in your query, this can be pretty fast on MyISAM, as it has a magic variable that contains the number of rows in a table. On InnoDB however, which is my storage engine of choice, there is no such variable and consequently it’s not pretty fast.
Paging Drupal
At DrupalConSF earlier this year I’d floated the idea of making Drupal 7 use SQL_CALC_FOUND_ROWS in its pager queries. These are queries generated specifically to display paginated lists of content and the API to do this is pretty straightforward. To do it I needed to add query hint support to the MySQL driver. When it turned out that PostgreSQL and Oracle also support query hints though, the aim became adding hint support for all database drivers.
That’s now done, though only the patch only implements hints on the pager under MySQL at the moment.
One issue keeps cropping up though, a blog by Alexey Kovyrin in 2007 that states SELECT COUNT(*) is faster than using SQL_CALC_FOUND_ROWS. It’s all very well to not have a patch accepted if that statement is correct, but in my experience that is in fact not the case. In my experience the stats are in fact the other way around, SQL_CALC_FOUND_ROWS is nearly always faster than SELECT COUNT(*).
To back up my claims I thought I should run some benchmarks.
I picked the Drupal pager query that lists content (nodes) on the content administration page. It selects node IDs from the node table with a WHERE clause which filters by the content language. Or, in plain SQL, what currently happens is:
SELECT COUNT(*) FROM node WHERE language = 'und';
SELECT nid FROM node WHERE language = 'und' LIMIT 0,50;
and what I’d like to happen is:
SELECT SQL_CALC_FOUND_ROWS nid FROM node WHERE language = 'und' LIMIT 0,50;
SELECT FOUND_ROWS();
Methodology
I ran two sets of tests. One on a node table with 5,000 rows and one with 200,000 rows. For each of these table sizes I ran a pager with 10, 20, 50, 100 and 200 loops, each time increasing the offset by 50; effectively paging through the table. I ran all these using both MyISAM and InnoDB as the storage engine for the node table and I ran them on two machines. One was my desktop, a dual core Athlon X2 5600 with 4Gb of RAM and the other is a single core Xen virtual machine with 512Mb of RAM.
I was hoping to also run tests with 10,000,000 rows, but the virtual machine did not complete any of the queries. So I ran these on my desktop machine only. Again for 10, 20, 50, 100 and 200 queries per run. First with an offset of 50, then with an offset of 10,000. I restarted the MySQL server between each run. To discount query cache advantages, I ran all tests with the query cache disabled. The script I used is attached at the bottom of this post. The calculated times do include the latency of client/server communication, though all tests ran via the local socket connection.
My desktop runs an OurDelta mysql .5.0.87 (the -d10-ourdelta-sail66) to be exact. The virtual machine runs 5.0.87 (-d10-ourdelta65). Before you complain that not running a vanilla MySQL invalidates the results, I run these because I am able to tweak InnoDB a bit more, so the I/O write load on the virtual machine is somewhat reduced compared to the vanilla MySQL.
Results

The graphs show that using SQL_CALC_FOUND_ROWS is virtually always faster than running two queries that each need to look at actual data. Even when using MyISAM. As the database gets bigger, the speed advantage of SQL_CALC_FOUND_ROWS increases. At the 10,000,000 row mark, it’s consistently about twice as fast.
Also interesting is that InnoDB seems significantly slower than MyISAM on the shorter runs. I say seems, because (especially with the 10,000,000 row table) the delay is caused by InnoDB first loading the table from disk into its buffer pool. In the spreadsheet you can see the first query takes up to 40 seconds, whilst subsequent ones are much faster. The MyISAM data is still in the OS file cache, so it doesn’t have that delay on the first query. Because I use innodb_flush_method=O_DIRECT, the InnoDB data is not kept in the OS file cache.
Conclusion
So, it’s official. COUNT(*) is dead, long live SQL_CALC_FOUND_ROWS! :-)
I’ve attached my raw results as a Gnumeric document, so feel free to peruse them. The test script I’ve used is also attached, so you can re-run the benchmark on your own systems if you wish.
Conclusion Addendum
As pointed out in the Drupal pager issue that caused me to run these tests, the query I’m benchmarking uses the language column, which is not indexed and the test also doesn’t allow the server to cache the COUNT(*) query. I’ve rerun the tests with 10 million rows after adding an index and I no longer get a signification speed difference between the two ways of getting the total number of rows.
So I suppose that at least SQL_CALC_FOUND_ROWS will cause your non-indexed pager queries to suck a lot less than they might otherwise and it won’t hurt if they are properly indexed
¹ I now work for Open Query as a consultant.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:10:"Open Query";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:13;a:6:{s:4:"data";s:78:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:59:"MySQL Sandbox now with plugins, more tests, instrumentation";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:59:"tag:blogger.com,1999:blog-16959946.post-7929248491486139547";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:80:"http://datacharmer.blogspot.com/2010/05/mysql-sandbox-now-with-plugins-more.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1742:"The latest release of MySQL Sandbox, 3.0.12, has integrated plugin installation features, as mentioned in my previous post.Not only that. This version has also more tests, fixes a couple of bugs, and introduces basic instrumentation. Now each script released with MySQL Sandbox, and every one that the Sandbox itself installs, can leave a trail in a file.Let's start with the plugin. The documentation has been updated to cover this new feature. And 27 new tests give me some confidence that it should work as advertised.While I was waiting for the test suite to finish its 238 tests, I was wondering how much was going on under the hood. So I spent one hour implementing some basic instrumentation, not only in the make_* scripts, but also in every script that the sandbox installs. The code is quite modular, and adding this feature was easy.Now, if you want to use this instrumentation, you need to create a file, and set the operating system variable $SBINSTR to the full path of that file prior to using the Sandbox. Then, every script will leave an entry in that file, saying its name, the current time, and which parameters was using.This is what I got after running the test suite. 66 instances of MySQL installed to perform over 200 tests, in about 18 minutes.MySQL Sandbox scriptscallsmake_sandbox 66low_level_make_sandbox 66make_replication_sandbox 8make_multiple_sandbox 7make_multiple_custom_sandbox 2Installed scriptscallsuse 440stop 192start 128clear 56sandbox_action 56sbtool 34stop_all 30use_all 20clear_all 13start_all 12send_kill 11restart 9initialize_slaves 8restart_all 4change_paths 2change_ports 1total 1165The new release is available from Launchpad or directly from the CPAN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Sun, 30 May 2010 19:01:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:8:{i:0;a:5:{s:4:"data";s:7:"sandbox";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"plugin";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:15:"instrumentation";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:6:"innodb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:11:"replication";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:7:"testing";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:9:"semisynch";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:3880:" | The latest release of MySQL Sandbox, 3.0.12, has integrated plugin installation features, as mentioned in my previous post. Not only that. This version has also more tests, fixes a couple of bugs, and introduces basic instrumentation. Now each script released with MySQL Sandbox, and every one that the Sandbox itself installs, can leave a trail in a file.
|
Let's start with the plugin. The documentation has been updated to cover this new feature. And 27 new tests give me some confidence that it should work as advertised.
While I was waiting for the test suite to finish its 238 tests, I was wondering how much was going on under the hood. So I spent one hour implementing some basic instrumentation, not only in the make_* scripts, but also in every script that the sandbox installs. The code is quite modular, and adding this feature was easy.
Now, if you want to use this instrumentation, you need to create a file, and set the operating system variable $SBINSTR to the full path of that file prior to using the Sandbox. Then, every script will leave an entry in that file, saying its name, the current time, and which parameters was using.
This is what I got after running the test suite. 66 instances of MySQL installed to perform over 200 tests, in about 18 minutes.
| MySQL Sandbox scripts | calls |
|---|
| make_sandbox | 66 |
| low_level_make_sandbox | 66 |
| make_replication_sandbox | 8 |
| make_multiple_sandbox | 7 |
| make_multiple_custom_sandbox | 2 |
| Installed scripts | calls |
|---|
| use | 440 |
| stop | 192 |
| start | 128 |
| clear | 56 |
| sandbox_action | 56 |
| sbtool | 34 |
| stop_all | 30 |
| use_all | 20 |
| clear_all | 13 |
| start_all | 12 |
| send_kill | 11 |
| restart | 9 |
| initialize_slaves | 8 |
| restart_all | 4 |
| change_paths | 2 |
| change_ports | 1 |
| total | 1165 |
|---|
The new release is available from Launchpad or directly from the CPAN
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Giuseppe Maxia";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:14;a:6:{s:4:"data";s:88:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:63:"Advanced Squid Caching in Scribd: Cache Invalidation Techniques";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:25:"http://kovyrin.net/?p=322";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:59:"http://feedproxy.google.com/~r/Homo-Adminus/~3/4ywVA01ppFY/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:3727:"Having a reverse-proxy web cache as one of the major infrastructure elements brings many benefits for large web applications: it reduces your application servers load, reduces average response times on your site, etc. But there is one problem every developer experiences when works with such a cache – cached content invalidation.
It is a complex problem that usually consists of two smaller ones: individual cache elements invalidation (you need to keep an eye on your data changes and invalidate cached pages when related data changes) and full cache purges (sometimes your site layout or page templates change and you need to purge all the cached pages to make sure users will get new visual elements of layout changes). In this post I’d like to look at a few techniques we use at Scribd to solve cache invalidation problems.
So, the first problem – ongoing cache invalidation when content changes. This is actually a pretty simple task in squid: you just use HTCP protocol and send CLR requests to your caching farm (we didn’t find any HTCP protocol implementations so we’ve implemented our own simple client that supports just one command).
Since we use haproxy to balance our traffic in the cluster it is hard to predict where should we send a purge request. So we fan those out to all cache servers.
To make sure cache purging won’t slow the site down, especially considering we need to do more that just a simple cache purge (submit documents to search indexes, etc, etc), we just spool a “document changed” request to a queue and then have a set of asynchronous processes that do all the work in background.
Next, The Hard Problem – handling full cache purges w/o killing our backend servers with 5x-10x traffic (our normal hit ratio is ~90-95%).
We’ve spent a lot of time thinking about this problem and the first idea we came up with was to have a loop process somewhere that would iterate all documents we have cached and purge them one by one… but that does not seem to be a practical solution when you have tens of millions documents (and few page versions per document) and obviously the solution would not scale with constantly growing documents corpus.
So we kept brainstorming and finally got one idea that works just perfectly for us: what if we’d be able to take our traffic and define a function f(t) that would return a percentage of the traffic that should be purged at any moment in time. So we did it – we’ve implemented a nginx module that would version our cache by assigning every cached page a revision (using a custom HTTP-headers + Vary-caching) and would be able to slowly migrate the cache from one revision to another over a pre-defined period of time.
Having that module we are able to do so called “slow” cache purges that could take any time from a few minutes (that still helps to reduce the load spike generated by the hottest content) up to many hours (this is what we normally use) or days (never used this option, but it is definitely possible).
Here is an example 100% cache purge over an 8 hour interval:
Daily hit ratio graph:
Weekly hit ratio graph:
As you can see, during those slow purges our cached pages would be slowly updated without putting too much pressure on the backend. Cache hit ratio would slowly degrade and then slowly get back to its normal levels, but with our normal (6-8 hours) purges hit ratio never gets lower that 65-70% which makes it possible for us to save huge amounts of money on not having 90% spare capacity just for the cache purge load surges (we used to have lots of spare application cluster capacity before introducing this approach).
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Sat, 29 May 2010 17:02:17 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:10:{i:0;a:5:{s:4:"data";s:10:"Admin-tips";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:11:"Development";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:11:"My Projects";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:8:"Networks";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:7:"caching";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:4:"HTCP";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:12:"invalidation";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:5:"Nginx";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:8;a:5:{s:4:"data";s:6:"plugin";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:9;a:5:{s:4:"data";s:5:"squid";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:6013:"Having a reverse-proxy web cache as one of the major infrastructure elements brings many benefits for large web applications: it reduces your application servers load, reduces average response times on your site, etc. But there is one problem every developer experiences when works with such a cache – cached content invalidation.
It is a complex problem that usually consists of two smaller ones: individual cache elements invalidation (you need to keep an eye on your data changes and invalidate cached pages when related data changes) and full cache purges (sometimes your site layout or page templates change and you need to purge all the cached pages to make sure users will get new visual elements of layout changes). In this post I’d like to look at a few techniques we use at Scribd to solve cache invalidation problems.
So, the first problem – ongoing cache invalidation when content changes. This is actually a pretty simple task in squid: you just use HTCP protocol and send CLR requests to your caching farm (we didn’t find any HTCP protocol implementations so we’ve implemented our own simple client that supports just one command).
Since we use haproxy to balance our traffic in the cluster it is hard to predict where should we send a purge request. So we fan those out to all cache servers.
To make sure cache purging won’t slow the site down, especially considering we need to do more that just a simple cache purge (submit documents to search indexes, etc, etc), we just spool a “document changed” request to a queue and then have a set of asynchronous processes that do all the work in background.
Next, The Hard Problem – handling full cache purges w/o killing our backend servers with 5x-10x traffic (our normal hit ratio is ~90-95%).
We’ve spent a lot of time thinking about this problem and the first idea we came up with was to have a loop process somewhere that would iterate all documents we have cached and purge them one by one… but that does not seem to be a practical solution when you have tens of millions documents (and few page versions per document) and obviously the solution would not scale with constantly growing documents corpus.
So we kept brainstorming and finally got one idea that works just perfectly for us: what if we’d be able to take our traffic and define a function f(t) that would return a percentage of the traffic that should be purged at any moment in time. So we did it – we’ve implemented a nginx module that would version our cache by assigning every cached page a revision (using a custom HTTP-headers + Vary-caching) and would be able to slowly migrate the cache from one revision to another over a pre-defined period of time.
Having that module we are able to do so called “slow” cache purges that could take any time from a few minutes (that still helps to reduce the load spike generated by the hottest content) up to many hours (this is what we normally use) or days (never used this option, but it is definitely possible).
Here is an example 100% cache purge over an 8 hour interval:
- Daily hit ratio graph:
- Weekly hit ratio graph:
As you can see, during those slow purges our cached pages would be slowly updated without putting too much pressure on the backend. Cache hit ratio would slowly degrade and then slowly get back to its normal levels, but with our normal (6-8 hours) purges hit ratio never gets lower that 65-70% which makes it possible for us to save huge amounts of money on not having 90% spare capacity just for the cache purge load surges (we used to have lots of spare application cluster capacity before introducing this approach).


PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Alexey Kovyrin";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:15;a:6:{s:4:"data";s:43:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:41:"Multiple query results and server details";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:48:"http://www.heidisql.com/rss.php?c=1,7&p=5867";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:46:"http://www.heidisql.com/forum.php?t=5867#p5867";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:734:"After having updated to the latest build via Help > "Check for updates" you will be able to see more than only the result of the last SELECT query in any "Query" tab. By default, HeidiSQL displays up to 10 result sets in subtabs. In case you want more just go toTools > Preferences > Dataand increase this value in Maximum number of query results. Different than before, these are the first result sets from your SQL code.Please note that HeidiSQL still does not separate multiple results from a stored procedure. Will be the next thing to implement soon.Also a minor new feature is the hint on the lower statusbar when hovering over the MySQL version. You will see various connection, server and client related details here.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Sat, 29 May 2010 11:08:49 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:1:{i:0;a:5:{s:4:"data";s:4:"News";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1180:"After having updated to the latest build via Help > "Check for updates" you will be able to see more than only the result of the last SELECT query in any "Query" tab. By default, HeidiSQL displays up to 10 result sets in subtabs. In case you want more just go to
Tools > Preferences > Data
and increase this value in Maximum number of query results. Different than before, these are the first result sets from your SQL code.

Please note that HeidiSQL still does not separate multiple results from a stored procedure. Will be the next thing to implement soon.
Also a minor new feature is the hint on the lower statusbar when hovering over the MySQL version. You will see various connection, server and client related details here.

PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"Ansgar Becker";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:16;a:6:{s:4:"data";s:43:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:33:"NDB ENGINE LIMITATIONS (rebuttal)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-3436373518787509996.post-2937582576238462656";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:73:"http://todointx.blogspot.com/2010/05/ndb-engine-limitations-rebuttal.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:9216:"Anirudh Tamsekar made a post yesterday that laid out a few of the limitations of MySQL Cluster that seem to cause him the most pain. However his assessment of the situation is quite misleading. A few of his statements are inaccurate but more than half of the limitations he sites to are out right false. Since comments on Anirudh's blog are being moderated, I give my rebuttal here, and site sources. · Database names, table names, and attribute names cannot be as long in NDB tables as with other table handlers. In NDB, attribute names are truncated to 31 characters, and if they are not unique after truncation, errors occur. Database names and table names can total a maximum of 122 charactersFalse: "Identifiers. Formerly (in MySQL 5.0 and earlier), database names, table names and attribute names could not be as long for NDB tables as tables using other storage engines, because attribute names were truncated internally. In MySQL 5.1 and later, names of MySQL Cluster databases, tables, and table columns follow the same rules regarding length as they do for any other storage engine."-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html· NDB does not support prefix indexes; only entire fields can be indexed.True· A big limitation is that in MySQL 4.1 and 5.0, all cluster table rows are of fixed length. This means, for example, that if a table has one or more VARCHAR fields containing only relatively small values, more memory and disk space will be required when using the NDB storage engine than would be for the same table and data using the MyISAM engine. This issue is on the “to-fix” list for MySQL Cluster 5.1.False: As of 5.1 (which has been GA for over a year and a half) this limitation applies only to on-disk columns."* Variable-length column support. The NDBCLUSTER storage engine now supports variable-length column types for in-memory tables. Previously, for example, any Cluster table having one or more VARCHAR fields which contained only relatively small values, much more memory and disk space were required when using the NDBCLUSTER storage engine than would have been the case for the same table and data using the MyISAM engine. In other words, in the case of a VARCHAR column, such a column required the same amount of storage as a CHAR column of the same size. In MySQL 5.1, this is no longer the case for in-memory tables, where storage requirements for variable-length column types such as VARCHAR and BINARY are comparable to those for these column types when used in MyISAM tables (see Section 10.5, “Data Type Storage Requirements”). "-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html· In NDB, the maximum number of metadata objects is limited to 20,000, including database tables, system tables, indexes, and BLOBs (binary large objects). This is a hard-coded limit that you cannot override with a configuration option.True: However the actual number is 20320.· The maximum permitted size of any one row in NDB is 8KB, not including data stored in BLOB columns (which are actually stored in a separate table internally).Somewhat true: There is actually a constant you can modify at compile time to increase the max row length (as number of 4 byte words)../storage/ndb/include/kernel/ndb_limits.h#define MAX_TUPLE_SIZE_IN_WORDS 2013and./storage/ndb/include/ndbapi/ndbapi_limits.h#define NDB_MAX_TUPLE_SIZE_IN_WORDS 2013· The maximum number of attributes per key in NDB is 32.True:· Autodiscovery of databases is not supported in NDB for multiple MySQL servers accessing the same cluster in MySQL Cluster. (You have to add each database manually on each SQL node.)False: "Autodiscovery of databases is now supported for multiple MySQL servers accessing the same MySQL Cluster. Formerly, autodiscovery in MySQL Cluster 5.1 and MySQL Cluster NDB 6.x releases required that a given mysqld was already running and connected to the cluster at the time that the database was created on a different mysqld—in other words, when a mysqld process connected to the cluster after a database named db_name was created, it was necessary to issue a CREATE DATABASE db_name or CREATE SCHEMA db_name statement on the “new” MySQL server when it first accesseed that MySQL Cluster. Beginning with MySQL Cluster NDB 6.2.16 and MySQL Cluster NDB 6.3.18, such a CREATE statement is no longer required. (Bug#39612)This also means that online schema changes in NDB tables are now possible. That is, the result of operations such as ALTER TABLE and CREATE INDEX performed on one SQL node in the cluster are now visible to the cluster's other SQL nodes without any additional action being taken."-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html· MySQL replication does not work correctly in NDB if updates are done on multiple MySQL servers; replication between clusters is on the feature list for MySQL 5.1.False: Again, 5.1 is GA over a year and a half. No need to treat this is a forward looking statement."* Replication with MySQL Cluster. It is now possible to use MySQL replication with Cluster databases. For details, see Section 17.6, “MySQL Cluster Replication”.Circular Replication. Circular replication is also supported with MySQL Cluster, beginning with MySQL 5.1.18. See Section 17.6.10, “MySQL Cluster Replication: Multi-Master and Circular Replication”.* auto_increment_increment and auto_increment_offset. The auto_increment_increment and auto_increment_offset server system variables are supported for Cluster replication beginning with MySQL 5.1.20, MySQL Cluster NDB 6.2.5, and MySQL Cluster 6.3.2."-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.htmlHowever, since the SQL node cannot know of the statements executed on another SQL node only row based replication is supported:"Replication. Statement-based replication is not supported. Use --binlog-format=ROW (or --binlog-format=MIXED) when setting up cluster replication. See Section 17.6, “MySQL Cluster Replication”, for more information. "· ALTER TABLE is not fully locking in NDB when you’re running multiple MySQL servers.False: With the changes to support discovery of CREATE DATABASE in MySQL Cluster NDB 6.2.16 and MySQL Cluster NDB 6.3.18 a global schema lock was introduced which consistiently locks tables across all SQL nodes in the cluster during DDL operations· All storage and management nodes within a cluster in NDB must have the same architecture. This restriction does not apply to machines simply running SQL nodes or any other clients that may be accessing the cluster.Misleading: All nodes must be the same endian nature. However machines running "mysql" clients have no such restriction. SQL nodes themselves do have limitation of being the same endian type as the data nodes. i.e. A Linux x86 machine cannot be a mysqld (SQL node) front end to ndbd (DATA nodes) running Solaris Sparc.-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-exclusive-to-cluster.html· It is not possible to make online schema changes in NDB, such as those accomplished using ALTER TABLE or CREATE INDEX. (However, you can import or create a table that uses a different storage engine and then convert it to NDB by using ALTER TABLE tbl_name ENGINE=NDBCLUSTER;.) ALTER TABLE works on occasions, but all it does is create a new table with the new structure and then import the data. This generally causes an error as NDB hits a limit somewhere. It is strongly recommended that you not use ALTER TABLE to make online schema changes.False: Again with the schema changes... See above.· Adding or removing nodes online is not possible in NDB. (The cluster must be restarted in such cases.)False: "In MySQL Cluster NDB 7.0 (beginning with MySQL Cluster NDB 6.4.0) and later MySQL Cluster release series, it is possible to add new data nodes to a running MySQL Cluster by performing a rolling restart, so that the cluster and the data stored in it remain available to applications."-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.htmlRemoval of data nodes online is possible for nodes that were added online. Nodes that were in the cluster at initial system startup cannot be removed without a total shutdown and restore. However online removal is not thouroghly tested so it is not officially supported.· The maximum number of storage nodes within an NDB cluster is 48.True:· The total maximum number of nodes in a cluster in MySQL Cluster is 63. This number includes all MySQL servers (that is, SQL nodes), storage nodes, and management servers.False: "Starting with MySQL Cluster NDB 6.1.1, the total maximum number of nodes in a MySQL Cluster is 255, including all SQL nodes (MySQL Servers), API nodes (applications accessing the cluster other than MySQL servers), data nodes, and management servers. The total number of data nodes and management nodes beginning with this version is 63, of which up to 48 can be data nodes.Note: The limitation that a data node cannot have a node ID greater than 49 continues to apply."-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 19:55:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:1:{i:0;a:5:{s:4:"data";s:17:"ndb mysql cluster";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:10777:"Anirudh Tamsekar made a post yesterday that laid out a few of the limitations of MySQL Cluster that seem to cause him the most pain. However his assessment of the situation is quite misleading. A few of his statements are inaccurate but more than half of the limitations he sites to are out right false. Since comments on Anirudh's blog are being moderated, I give my rebuttal here, and site sources.
· Database names, table names, and attribute names cannot be as long in NDB tables as with other table handlers. In NDB, attribute names are truncated to 31 characters, and if they are not unique after truncation, errors occur. Database names and table names can total a maximum of 122 characters
False: "Identifiers. Formerly (in MySQL 5.0 and earlier), database names, table names and attribute names could not be as long for NDB tables as tables using other storage engines, because attribute names were truncated internally. In MySQL 5.1 and later, names of MySQL Cluster databases, tables, and table columns follow the same rules regarding length as they do for any other storage engine."
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html
· NDB does not support prefix indexes; only entire fields can be indexed.
True
· A big limitation is that in MySQL 4.1 and 5.0, all cluster table rows are of fixed length. This means, for example, that if a table has one or more VARCHAR fields containing only relatively small values, more memory and disk space will be required when using the NDB storage engine than would be for the same table and data using the MyISAM engine. This issue is on the “to-fix” list for MySQL Cluster 5.1.
False: As of 5.1 (which has been GA for over a year and a half) this limitation applies only to on-disk columns.
"* Variable-length column support. The NDBCLUSTER storage engine now supports variable-length column types for in-memory tables.
Previously, for example, any Cluster table having one or more VARCHAR fields which contained only relatively small values, much more memory and disk space were required when using the NDBCLUSTER storage engine than would have been the case for the same table and data using the MyISAM engine. In other words, in the case of a VARCHAR column, such a column required the same amount of storage as a CHAR column of the same size. In MySQL 5.1, this is no longer the case for in-memory tables, where storage requirements for variable-length column types such as VARCHAR and BINARY are comparable to those for these column types when used in MyISAM tables (see Section 10.5, “Data Type Storage Requirements”). "
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html
· In NDB, the maximum number of metadata objects is limited to 20,000, including database tables, system tables, indexes, and BLOBs (binary large objects). This is a hard-coded limit that you cannot override with a configuration option.
True: However the actual number is 20320.
· The maximum permitted size of any one row in NDB is 8KB, not including data stored in BLOB columns (which are actually stored in a separate table internally).
Somewhat true: There is actually a constant you can modify at compile time to increase the max row length (as number of 4 byte words).
./storage/ndb/include/kernel/ndb_limits.h
#define MAX_TUPLE_SIZE_IN_WORDS 2013
and
./storage/ndb/include/ndbapi/ndbapi_limits.h
#define NDB_MAX_TUPLE_SIZE_IN_WORDS 2013
· The maximum number of attributes per key in NDB is 32.
True:
· Autodiscovery of databases is not supported in NDB for multiple MySQL servers accessing the same cluster in MySQL Cluster. (You have to add each database manually on each SQL node.)
False: "Autodiscovery of databases is now supported for multiple MySQL servers accessing the same MySQL Cluster. Formerly, autodiscovery in MySQL Cluster 5.1 and MySQL Cluster NDB 6.x releases required that a given mysqld was already running and connected to the cluster at the time that the database was created on a different mysqld—in other words, when a mysqld process connected to the cluster after a database named db_name was created, it was necessary to issue a CREATE DATABASE db_name or CREATE SCHEMA db_name statement on the “new” MySQL server when it first accesseed that MySQL Cluster. Beginning with MySQL Cluster NDB 6.2.16 and MySQL Cluster NDB 6.3.18, such a CREATE statement is no longer required. (Bug#39612)
This also means that online schema changes in NDB tables are now possible. That is, the result of operations such as ALTER TABLE and CREATE INDEX performed on one SQL node in the cluster are now visible to the cluster's other SQL nodes without any additional action being taken."
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html
· MySQL replication does not work correctly in NDB if updates are done on multiple MySQL servers; replication between clusters is on the feature list for MySQL 5.1.
False: Again, 5.1 is GA over a year and a half. No need to treat this is a forward looking statement.
"* Replication with MySQL Cluster. It is now possible to use MySQL replication with Cluster databases. For details, see Section 17.6, “MySQL Cluster Replication”.
Circular Replication. Circular replication is also supported with MySQL Cluster, beginning with MySQL 5.1.18. See Section 17.6.10, “MySQL Cluster Replication: Multi-Master and Circular Replication”.
* auto_increment_increment and auto_increment_offset. The auto_increment_increment and auto_increment_offset server system variables are supported for Cluster replication beginning with MySQL 5.1.20, MySQL Cluster NDB 6.2.5, and MySQL Cluster 6.3.2."
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html
However, since the SQL node cannot know of the statements executed on another SQL node only row based replication is supported:
"Replication. Statement-based replication is not supported. Use --binlog-format=ROW (or --binlog-format=MIXED) when setting up cluster replication. See Section 17.6, “MySQL Cluster Replication”, for more information. "
· ALTER TABLE is not fully locking in NDB when you’re running multiple MySQL servers.
False: With the changes to support discovery of CREATE DATABASE in MySQL Cluster NDB 6.2.16 and MySQL Cluster NDB 6.3.18 a global schema lock was introduced which consistiently locks tables across all SQL nodes in the cluster during DDL operations
· All storage and management nodes within a cluster in NDB must have the same architecture. This restriction does not apply to machines simply running SQL nodes or any other clients that may be accessing the cluster.
Misleading: All nodes must be the same endian nature. However machines running "mysql" clients have no such restriction. SQL nodes themselves do have limitation of being the same endian type as the data nodes. i.e. A Linux x86 machine cannot be a mysqld (SQL node) front end to ndbd (DATA nodes) running Solaris Sparc.
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-exclusive-to-cluster.html
· It is not possible to make online schema changes in NDB, such as those accomplished using ALTER TABLE or CREATE INDEX. (However, you can import or create a table that uses a different storage engine and then convert it to NDB by using ALTER TABLE tbl_name ENGINE=NDBCLUSTER;.) ALTER TABLE works on occasions, but all it does is create a new table with the new structure and then import the data. This generally causes an error as NDB hits a limit somewhere. It is strongly recommended that you not use ALTER TABLE to make online schema changes.
False: Again with the schema changes... See above.
· Adding or removing nodes online is not possible in NDB. (The cluster must be restarted in such cases.)
False: "In MySQL Cluster NDB 7.0 (beginning with MySQL Cluster NDB 6.4.0) and later MySQL Cluster release series, it is possible to add new data nodes to a running MySQL Cluster by performing a rolling restart, so that the cluster and the data stored in it remain available to applications."
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html
Removal of data nodes online is possible for nodes that were added online. Nodes that were in the cluster at initial system startup cannot be removed without a total shutdown and restore. However online removal is not thouroghly tested so it is not officially supported.
· The maximum number of storage nodes within an NDB cluster is 48.
True:
· The total maximum number of nodes in a cluster in MySQL Cluster is 63. This number includes all MySQL servers (that is, SQL nodes), storage nodes, and management servers.
False: "Starting with MySQL Cluster NDB 6.1.1, the total maximum number of nodes in a MySQL Cluster is 255, including all SQL nodes (MySQL Servers), API nodes (applications accessing the cluster other than MySQL servers), data nodes, and management servers. The total number of data nodes and management nodes beginning with this version is 63, of which up to 48 can be data nodes.
Note: The limitation that a data node cannot have a node ID greater than 49 continues to apply."
-- http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-limitations-resolved.html
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:18:"Matthew Montgomery";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:17;a:6:{s:4:"data";s:68:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:45:"tcpdump errors on FreeBSD for mk-query-digest";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:38:"http://ronaldbradford.com/blog/?p=2680";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:88:"http://ronaldbradford.com/blog/tcpdump-errors-on-freebsd-for-mk-query-digest-2010-05-28/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:897:"While I use this tcpdump command for MySQL query analysis with mk-query-digest, I found recently that it didn’t work on FreeBSD
$ tcpdump -i bge0 port 3306 -s 65535 -x -n -q -tttt -c 5
tcpdump: syntax error
It left me perplexed and reading the man page seemed to indicate my options were valid. I tried a few variances just to be sure without success.
$ tcpdump -i bge0 -c 5 port 3306 -x
tcpdump: syntax error
$ tcpdump -i bge0 -c 5 port 3306 -q
tcpdump: syntax error
$ tcpdump -i bge0 -c 5 port 3306 -tttt
tcpdump: syntax error
The solution was actually quite simple in the end, it had nothing to do with the commands, it had everything to do with the order of them. Placing port as the last option solved the problem.
$ tcpdump -i bge0 -s 65535 -x -n -q -tttt -c 5 port 3306
$ uname -a
FreeBSD db4.example.com 6.3-RELEASE-p3 FreeBSD 6.3-RELEASE-p3 #0: Wed Jul 16 05:13:50 EDT 200
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 19:50:39 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:6:{i:0;a:5:{s:4:"data";s:9:"Databases";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:12:"Professional";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:7:"freebsd";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:7:"maatkit";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:15:"mk-query-digest";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1263:"While I use this tcpdump command for MySQL query analysis with mk-query-digest, I found recently that it didn’t work on FreeBSD
$ tcpdump -i bge0 port 3306 -s 65535 -x -n -q -tttt -c 5
tcpdump: syntax error
It left me perplexed and reading the man page seemed to indicate my options were valid. I tried a few variances just to be sure without success.
$ tcpdump -i bge0 -c 5 port 3306 -x
tcpdump: syntax error
$ tcpdump -i bge0 -c 5 port 3306 -q
tcpdump: syntax error
$ tcpdump -i bge0 -c 5 port 3306 -tttt
tcpdump: syntax error
The solution was actually quite simple in the end, it had nothing to do with the commands, it had everything to do with the order of them. Placing port as the last option solved the problem.
$ tcpdump -i bge0 -s 65535 -x -n -q -tttt -c 5 port 3306
$ uname -a
FreeBSD db4.example.com 6.3-RELEASE-p3 FreeBSD 6.3-RELEASE-p3 #0: Wed Jul 16 05:13:50 EDT 200
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:15:"Ronald Bradford";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:18;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:103:"Friday Tips #2: Migrating JSF 1.2 + RichFaces to Java EE 6, Embedded and Arquillian, EJB 3.1 Timer, ...";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:52:"http://blogs.sun.com/theaquarium/entry/friday_tips_2";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:52:"http://blogs.sun.com/theaquarium/entry/friday_tips_2";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:895:"
Here are some tips that have been recently published on Java EE 6 & GlassFish:
•
Migrating JSF 1.2 + RichFaces 3.x to Java EE 6 / GlassFish v3
•
Mercurial and OpenSolaris and GlassFish
•
How do I setup a DataSource in Embedded GlassFish when using Arquillian?
•
Learning GlassFish v3 Command Line Administration Interface (CLI)
•
Java EE 6: Understanding Contexts and Dependency Injection (CDI), Part 1 - (in Japanese)
•
Example of EJB 3.1 Stateful Session Bean and Servlet
•
Message Driven Bean Example with Servlet Client
•
EJB 3.1 Timer Simple Example
•
How to set up an Ubuntu, Glassfish, MySQL, Java stack
•
GlassFish 3.1 Milestone 1 - Clustering and App Versioning, Screencast #1 and #2
Let us know if you have seen or published a detailed tip like shown above and we'll be happy to share them.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 19:00:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:5:"HowTo";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:9:"glassfish";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:7:"javaee6";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:4:"tips";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2563:"
Here are some tips that have been recently published on Java EE 6 & GlassFish:
Let us know if you have seen or published a detailed tip like shown above and we'll be happy to share them.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:12:"The Aquarium";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:19;a:6:{s:4:"data";s:78:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:52:"Log Buffer #190, A Carnival of the Vanities for DBAs";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:36:"http://www.pythian.com/news/?p=12601";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:85:"http://www.pythian.com/news/12601/log-buffer-190-a-carnival-of-the-vanities-for-dbas/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1858:"Welcome to Log Buffer, the weekly roundup of database blogs. We’re back this week with a short Log Buffer #190. Only ten more issues, and we’ll be celebrating our 200th edition post.
Chen Shapira was eager to share news early this week, sending along her favorite picks on Tuesday.
Prof. Neil Gunther doesn’t like the way commercial load testing software distributes think times.
Miladin Modrakovic wants to add columns to a table and initialize them with values. When the table is huge, updates can take too long. No worries! Datapump will save the day!
Tom Kyte is upset because a vendor sent him his password by email.
Iggy Fernandez had fun at NoCOUG’s Spring Conference, as demonstrated in the pictures from the event.
Charles Hooper posted a small series on how column order in the query can impact performance. Sometimes. Maybe.
Chris Presley contributed a few good SQL Server archive articles:
One which talks about data compression commands in SQL 2008 and 2008 R2. And another which highlights a difference in SQL Business Development Studio (SQL BIDS) between SQL 2008 and 2008 R2 for anyone that uses SQL Integration Services (SSIS).
In Postgres news, Dave Page compares VoltDB to Postgres.
PGCon2010 ended this week with positive reviews on the “Hall Track” from The Endpoint Team.
For MySQL, Vadim posts FlashCache: tpcc workload – the last in a series on FlashCache testing when the cache is placed on Intel SSD card.
Ronald Branford notes in MySQL Best Practices: User Security – that it is critical that to not use the default MySQL installation security because it’s insecure.
Lastly we wrap up with a few more were archives added by Alex Fatkulin that didn’t get out last week.
Timing Improvements in Oracle 11GR2 and tips for the case of a slow lookup.
Until next week.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 16:53:12 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:8:{i:0;a:5:{s:4:"data";s:16:"Group Blog Posts";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:10:"Log Buffer";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:5:"NoSQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:6:"Oracle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:10:"PostgreSQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:10:"SQL Server";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:14:"Technical Blog";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:4208:"Welcome to Log Buffer, the weekly roundup of database blogs. We’re back this week with a short Log Buffer #190. Only ten more issues, and we’ll be celebrating our 200th edition post.
Chen Shapira was eager to share news early this week, sending along her favorite picks on Tuesday.
Prof. Neil Gunther doesn’t like the way commercial load testing software distributes think times.
Miladin Modrakovic wants to add columns to a table and initialize them with values. When the table is huge, updates can take too long. No worries! Datapump will save the day!
Tom Kyte is upset because a vendor sent him his password by email.
Iggy Fernandez had fun at NoCOUG’s Spring Conference, as demonstrated in the pictures from the event.
Charles Hooper posted a small series on how column order in the query can impact performance. Sometimes. Maybe.
Chris Presley contributed a few good SQL Server archive articles:
One which talks about data compression commands in SQL 2008 and 2008 R2. And another which highlights a difference in SQL Business Development Studio (SQL BIDS) between SQL 2008 and 2008 R2 for anyone that uses SQL Integration Services (SSIS).
In Postgres news, Dave Page compares VoltDB to Postgres.
PGCon2010 ended this week with positive reviews on the “Hall Track” from The Endpoint Team.
For MySQL, Vadim posts FlashCache: tpcc workload – the last in a series on FlashCache testing when the cache is placed on Intel SSD card.
Ronald Branford notes in MySQL Best Practices: User Security – that it is critical that to not use the default MySQL installation security because it’s insecure.
Lastly we wrap up with a few more were archives added by Alex Fatkulin that didn’t get out last week.
Timing Improvements in Oracle 11GR2 and tips for the case of a slow lookup.
Until next week.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:17:"The Pythian Group";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:20;a:6:{s:4:"data";s:53:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:38:"MySQL Replication for Backups and more";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-7953513766786004980.post-6688781157158134807";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:76:"http://blog.mysqlboy.com/2010/05/mysql-replication-for-backups-and-more.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:5590:"You might be fortunate enough to allow yourself some downtime, it is dependent on your application and business model. During this window it's possible for you to stop your MySQL daemon or lock your tables to give yourself a consistent backup of your data. Quite often this is a luxury that you cannot afford. If you are tied to a strict uptime that doesn't permit any interruption to your data availability then MySQL Replication could be the answer you need to grab that essential backup file. Once you've enabled Replication to a slave then you have the chance to backup by stopping the replication thread and mitigate the risk of corruption whilst securing your latest dataset. Using the slave will also negate any overhead a backup like mysqldump would have on your active Master server.Although in this case we are deploying Replication to take consistent backups of our data, there are many uses for the mechanism such as scaling out you solution, analytics and reporting, data distribution over geographically disperse locations and high availability.Replication at a high level overview works like so...For the sake of the guide we will use Server1 and Server2, Master and Slave respectively.Changes are made to data on the Master (Server1). These changes are logged in the Binary Log of the Master and are referred to as binary log events. The Slave (Server2) will then copy the Master's binary log events into it's Relay Log. The Slave will then replay the events that it has copied into the Relay Log to it's own data. The result is an identical dataset. There are many guides online to setting up replication but it's essentially an easy feat to complete. On your journey into replication you will encounter all sorts of extra options such as type of replication (statement, row, mixed), exclusion of certain tables or even whole databases.Setting Up Replication (quick guide)Before you read on, I assume you have MySQL installed on both Server and the servers are on the same network, communicating. This guide takes you down the basic route of replicating to a single slave.To ensure that Replication will work you need to ensure a few configuration settings are set. Check your my.cnf for the following attributes:Master;[mysqld]server_id=1log_bin = {filename} e.g. mysql-binSlave;[mysqld]server_id=10log_bin = {filename}relay_log = mysql-relay-binlog_slave_updates = 1read_only = 1The server-id attribute needs to be a unique integer. You may incur problems if you have duplicates on your network so set this to something unused. Best practice is to ensure you set the servier-id in my.cnf on all your servers as the MySQL will default to 1 and think it's a master. I frequently see the use of the last quartet of the server's IP address. Ensure that you're using binary logging on your master too, Replication will not work without it. Make sure that the lines skip-networking and bind-address are either commented out or deleted as these will make it impossible to connect to your network. Once your my.cnf complies with Replication's needs, restart the mysql daemon;OS commandshell$ mysqladmin -uroot -p shutdownshell$ service mysql startThe previous OS commands will ensure graceful cycle of your daemon. Run them on your command line.1. AuthenticationAt the beginning we need to create a user on the Master so that our slave's thread can access our data. Login to your master MySQL node and then add your replication userMySQL master commandmysql> GRANT REPLICATION SLAVE ON *.* TO `slave`@`Server2` IDENTIFIED BY 'password';This command will add a user to your Master that allows the Slave to connect.**It's best practice that this is account is used for no other purpose then replication.**2. Load DataThis step is optional. If you've got data in your tables already then this is a sensible step to undertake. Dump your data into your slave using the mysqldump tool;OS master commandshell$ mysqldump -u{user} -p{password} {--single-transaction|--lock-all-tables}--all-databases --master-data=1 --host=server1 | mysql -u{user}-p{password} --host=server2This mysqldump command will dump your data from Server1 and the pipe the output into Server2. You can also dump to a file using the *nix redirect;OS master commandshell$ mysqldump -u{user} -p{password} {--single-transaction|--lock-all-tables} --all-databases --master-data=1 > masterdata_dump.sqland then copy the output file to your slave and import it.3. Configure SlaveStart a MySQL session on your slave and enter the following command. This coordinates your replication. It tells the slave how to connect to your master server.MySQL slave commandmysql> CHANGE MASTER TOMASTER_HOST='ServerIP/FQDN',MASTER_USER='ReplClient',MASTER_PASSWORD='ClientPassword',MASTER_LOG_FILE='mysql-bin.000001',MASTER_LOG_POS=98;4. Start SlaveMySQL slave commandmysql> START SLAVE;All going well you should be up and running. Check your replication using commandMySQL slave commandmysql> SHOW SLAVE STATUS;Backing UpNow you have an operational Replication Setup you can explore the different tools on the market for backing up your data. A robust and thorough disaster recovery policy is a gem in the crown of the DBA. Without it one will need to keep an up-to-date CV handy at all times because disasters do happen! Plan a full backup and restore strategy, consider what backup window is available to you, how much data you can afford to lose, if you need 'point in time' restoration. Check out ProductionDBA.com's MySQL Blog for a great summary on MySQL Backup/Restore tools.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 15:40:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:6:"backup";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:11:"replication";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:6860:"You might be fortunate enough to allow yourself some downtime, it is dependent on your application and business model. During this window it's possible for you to stop your MySQL daemon or lock your tables to give yourself a consistent backup of your data. Quite often this is a luxury that you cannot afford. If you are tied to a strict uptime that doesn't permit any interruption to your data availability then MySQL Replication could be the answer you need to grab that essential backup file. Once you've enabled Replication to a slave then you have the chance to backup by stopping the replication thread and mitigate the risk of corruption whilst securing your latest dataset. Using the slave will also negate any overhead a backup like mysqldump would have on your active Master server.
Although in this case we are deploying Replication to take consistent backups of our data, there are many uses for the mechanism such as scaling out you solution, analytics and reporting, data distribution over geographically disperse locations and high availability.
Replication at a high level overview works like so...
For the sake of the guide we will use Server1 and Server2, Master and Slave respectively.
- Changes are made to data on the Master (Server1). These changes are logged in the Binary Log of the Master and are referred to as binary log events.
- The Slave (Server2) will then copy the Master's binary log events into it's Relay Log.
- The Slave will then replay the events that it has copied into the Relay Log to it's own data. The result is an identical dataset.
There are many guides online to setting up replication but it's essentially an easy feat to complete. On your journey into replication you will encounter all sorts of extra options such as type of replication (statement, row, mixed), exclusion of certain tables or even whole databases.
Setting Up Replication (quick guide)
Before you read on, I assume you have MySQL installed on both Server and the servers are on the same network, communicating. This guide takes you down the basic route of replicating to a single slave.
To ensure that Replication will work you need to ensure a few configuration settings are set. Check your my.cnf for the following attributes:
Master;
[mysqld]
server_id=1
log_bin = {filename} e.g. mysql-bin
Slave;
[mysqld]
server_id=10
log_bin = {filename}
relay_log = mysql-relay-bin
log_slave_updates = 1
read_only = 1
The server-id attribute needs to be a unique integer. You may incur problems if you have duplicates on your network so set this to something unused. Best practice is to ensure you set the servier-id in my.cnf on all your servers as the MySQL will default to 1 and think it's a master. I frequently see the use of the last quartet of the server's IP address. Ensure that you're using binary logging on your master too, Replication will not work without it. Make sure that the lines skip-networking and bind-address are either commented out or deleted as these will make it impossible to connect to your network. Once your my.cnf complies with Replication's needs, restart the mysql daemon;
OS command
shell$ mysqladmin -uroot -p shutdown
shell$ service mysql start
The previous OS commands will ensure graceful cycle of your daemon. Run them on your command line.
1. Authentication
At the beginning we need to create a user on the Master so that our slave's thread can access our data. Login to your master MySQL node and then add your replication user
MySQL master command
mysql> GRANT REPLICATION SLAVE ON *.* TO `slave`@`Server2` IDENTIFIED BY 'password';
This command will add a user to your Master that allows the Slave to connect.
**It's best practice that this is account is used for no other purpose then replication.**
2. Load Data
This step is optional. If you've got data in your tables already then this is a sensible step to undertake. Dump your data into your slave using the mysqldump tool;
OS master command
shell$ mysqldump -u{user} -p{password} {--single-transaction|--lock-all-tables}--all-databases --master-data=1 --host=server1 | mysql -u{user}-p{password} --host=server2
This mysqldump command will dump your data from Server1 and the pipe the output into Server2. You can also dump to a file using the *nix redirect;
OS master command
shell$ mysqldump -u{user} -p{password} {--single-transaction|--lock-all-tables} --all-databases --master-data=1 > masterdata_dump.sql
and then copy the output file to your slave and import it.
3. Configure Slave
Start a MySQL session on your slave and enter the following command. This coordinates your replication. It tells the slave how to connect to your master server.
MySQL slave command
mysql> CHANGE MASTER TO
MASTER_HOST='ServerIP/FQDN',
MASTER_USER='ReplClient',
MASTER_PASSWORD='ClientPassword',
MASTER_LOG_FILE='mysql-bin.000001',
MASTER_LOG_POS=98;
4. Start Slave
MySQL slave command
All going well you should be up and running. Check your replication using command
MySQL slave command
mysql> SHOW SLAVE STATUS;
Backing Up
Now you have an operational Replication Setup you can explore the different tools on the market for backing up your data. A robust and thorough disaster recovery policy is a gem in the crown of the DBA. Without it one will need to keep an up-to-date CV handy at all times because disasters do happen! Plan a full backup and restore strategy, consider what backup window is available to you, how much data you can afford to lose, if you need 'point in time' restoration. Check out ProductionDBA.com's MySQL Blog for a great summary on MySQL Backup/Restore tools.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:8:"MySQLBoy";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:21;a:6:{s:4:"data";s:43:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:51:"Comments on Kostja’s motivations on hacking MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:30:"http://askmonty.org/blog/?p=34";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:74:"http://askmonty.org/blog/comments-on-kostjas-motivations-on-hacking-mysql/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:7260:"Recently Kostja posted two insightful blog posts about his thoughts on the currently fragmented MySQL landscape and quality of a piece of code contributed by a “community member”, which is a MySQL euphemism for a person not employed by MySQL. (Hence, the full time MySQL developers are themselves not members of their own community?)
I wanted to comment on both posts, but found out Kostja only allows logged in LiveJournal users to comment, which I am not. Since the posts were interesting enough, I suppose they deserve a comment in a new blog post like this instead.
From “RDBMS software is difficult” (slightly reordered)
The main reason it is harder to do changes with MySQL is a larger legacy, including political and managerial, but you get into exact same situation in any project after your first release. I said that all things considered, the current MySQL trunk is perhaps as good starting point for rethinking as the current Drizzle. [...] I would not want to actually diminish importance of Drizzle (initially, I was fond of it and rather wanted to join; the reason I didn’t, I’ve just spelled out). I’d love to be proven wrong, but I don’t see it becoming such a universal piece of software that I personally would like to be contributing to.
Recent years there’s been a serious fragmentation of technical thought in MySQL ecosystem. Drizzle, MariaDB, Percona are excellent for community, but are not at all good for our ability to make MySQL a universal database platform. I mean, ability to make MySQL a database platform comparable to what Linux/Unix is nowadays to operating systems. Truth be said, I am not at all sure that my current employer, Oracle, is a good host to seek this holy grail either. Perhaps we’ll never get there, not with this project.
Kostja, you are not alone with such thoughts. I think it makes sense to separate Drizzle and other forks when one looks at the MySQL ecosystem. In my opinion, when Drizzle got started, all the good reasons for a new fork existed: Stagnated development in the original project, patches not flowing into the main trunk, not answering to new technological needs (the cloud)… at the same time, Drizzle’s approach is simply not useful in the short term. It is now 2 years since Drizzle got started. They will go into Beta this Summer, and even their first release is not even aiming for addressing the entire MySQL space.
This means that even in a best case scenario for Drizzle, short term it simply wasn’t realistic that all MySQL developers would have joined it. MySQL has a large install base of servers currently in production. You can not turn your back to that, on the contrary, your best bet for a universal database of course is always the one who already has so many users. Even so, I think there was good reasons for a small group of developers forking Drizzle. This has the cost they are essentially away from MySQL/MariaDB development, except the friendly support we still give to each other.
So to your thoughts on this, I just wanted to say this is exactly the same reason I work for MariaDB and not Drizzle. If Drizzle one day “gets there” I will not hesitate to redirect my energy when the time is right, but for the time being, this is the reasons I work for MariaDB.
As for all the other forks, which remain more or less compatible with the original MySQL code base, the situation is different. It is mostly a result of how MySQL was organized: on the outside, even if we wanted, we cannot participate in some of the MySQL infrastructure like Pushbuild, we cannot call our packages “MySQL” for trademark reasons (Percona did it first but not anymore), and MySQL will not incorporate our code into itself (when released as GPL), so we end up diverging.
So when looking at the big picture, it is a bit messy at the moment. At the same time, it is nice to see how the people in the MySQL community, whether developers or else, are all very committed to continue to work together, despite current obstacles. For instance, also myself wouldn’t trust that Oracle is the perfect steward to take MySQL forward, but I don’t think MySQL AB was near-perfect either! As long as Oracle pays you a salary and you can develop GPL code, it is up to the community as a whole to make sure there is a future for that code. Oracle is welcome to contribute – and they do – but the future of our open source database must not be dependent on what one company is doing.
Then in “How on earth is it possible to accept this” Kostja laments the low quality of a contributed patch:
Should a semi-working, semi-documented code be accepted, expecting that there will be more patches?
The answer is obviously “No”. A semi-working patch should be reviewed and feedback given, so the original developer can continue to perfect it.
Unfortunately, this was not happening in MySQL. In MariaDB 5.2 we have now pulled in quite many patches created in the MySQL community over the years. We had to spend significant effort to get them into acceptable quality. This is not how an open source project is supposed to work. (If you send a low quality patch to Linux, it’s not like Linus will hold your hand and fix it for you.) But since this kind of workflow has not been in place before, and many patches were several years old, we have considered our work on them as a “bootsrapping effort”. It didn’t feel right to go back to someone that contributed something 3 years ago and now ask them to fix a few things. Even so, we don’t intend to continue this way, we do want to turn MariaDB into a project where it is up to the contributor to finish several iterations of a patch, then we commit it.
And by the way, it is not like being employed by MySQL/Sun/Oracle magically makes you a flawless coder either. In recent merges we have started to reject some patches coming from MySQL, since they don’t pass our review, or more likely since they get caught in our automated QA (buildbot). For instance, by including some engines that MySQL doesn’t, we get broader test coverage then MySQL and sometimes catch errors that pass the MySQL process.
And to finish the loop, I suspect MariaDB developers (in particular those employed by Monty Program) are not perfect either! One of the patches we spent some effort improving before committing it was Segmented Key Cache. But now we get feedback from the original developer that the finalized patch gives him less performance boost than his original patch does. Maybe we broke something while reviewing? This is still being investigated as I write.
I just wanted to respond to this since there is a perception with many in MySQL (and I speak perhaps more of some managers than people like Kostja) that a non-employee simply couldn’t produce something useful and this community thing is just a distraction. I hope MariaDB 5.2 proves that there have been useful contributions, and I hope the future will prove there can be much more. And, no matter who you are employed with, you will produce bugs every now and then.
RDBMS software is difficultRDBMS software is difficult";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 13:43:49 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:1:{i:0;a:5:{s:4:"data";s:13:"Uncategorized";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:8286:"Recently Kostja posted two insightful blog posts about his thoughts on the currently fragmented MySQL landscape and quality of a piece of code contributed by a “community member”, which is a MySQL euphemism for a person not employed by MySQL. (Hence, the full time MySQL developers are themselves not members of their own community?)
I wanted to comment on both posts, but found out Kostja only allows logged in LiveJournal users to comment, which I am not. Since the posts were interesting enough, I suppose they deserve a comment in a new blog post like this instead.
From “RDBMS software is difficult” (slightly reordered)
The main reason it is harder to do changes with MySQL is a larger legacy, including political and managerial, but you get into exact same situation in any project after your first release. I said that all things considered, the current MySQL trunk is perhaps as good starting point for rethinking as the current Drizzle. [...] I would not want to actually diminish importance of Drizzle (initially, I was fond of it and rather wanted to join; the reason I didn’t, I’ve just spelled out). I’d love to be proven wrong, but I don’t see it becoming such a universal piece of software that I personally would like to be contributing to.
Recent years there’s been a serious fragmentation of technical thought in MySQL ecosystem. Drizzle, MariaDB, Percona are excellent for community, but are not at all good for our ability to make MySQL a universal database platform. I mean, ability to make MySQL a database platform comparable to what Linux/Unix is nowadays to operating systems. Truth be said, I am not at all sure that my current employer, Oracle, is a good host to seek this holy grail either. Perhaps we’ll never get there, not with this project.
Kostja, you are not alone with such thoughts. I think it makes sense to separate Drizzle and other forks when one looks at the MySQL ecosystem. In my opinion, when Drizzle got started, all the good reasons for a new fork existed: Stagnated development in the original project, patches not flowing into the main trunk, not answering to new technological needs (the cloud)… at the same time, Drizzle’s approach is simply not useful in the short term. It is now 2 years since Drizzle got started. They will go into Beta this Summer, and even their first release is not even aiming for addressing the entire MySQL space.
This means that even in a best case scenario for Drizzle, short term it simply wasn’t realistic that all MySQL developers would have joined it. MySQL has a large install base of servers currently in production. You can not turn your back to that, on the contrary, your best bet for a universal database of course is always the one who already has so many users. Even so, I think there was good reasons for a small group of developers forking Drizzle. This has the cost they are essentially away from MySQL/MariaDB development, except the friendly support we still give to each other.
So to your thoughts on this, I just wanted to say this is exactly the same reason I work for MariaDB and not Drizzle. If Drizzle one day “gets there” I will not hesitate to redirect my energy when the time is right, but for the time being, this is the reasons I work for MariaDB.
As for all the other forks, which remain more or less compatible with the original MySQL code base, the situation is different. It is mostly a result of how MySQL was organized: on the outside, even if we wanted, we cannot participate in some of the MySQL infrastructure like Pushbuild, we cannot call our packages “MySQL” for trademark reasons (Percona did it first but not anymore), and MySQL will not incorporate our code into itself (when released as GPL), so we end up diverging.
So when looking at the big picture, it is a bit messy at the moment. At the same time, it is nice to see how the people in the MySQL community, whether developers or else, are all very committed to continue to work together, despite current obstacles. For instance, also myself wouldn’t trust that Oracle is the perfect steward to take MySQL forward, but I don’t think MySQL AB was near-perfect either! As long as Oracle pays you a salary and you can develop GPL code, it is up to the community as a whole to make sure there is a future for that code. Oracle is welcome to contribute – and they do – but the future of our open source database must not be dependent on what one company is doing.
Then in “How on earth is it possible to accept this” Kostja laments the low quality of a contributed patch:
Should a semi-working, semi-documented code be accepted, expecting that there will be more patches?
The answer is obviously “No”. A semi-working patch should be reviewed and feedback given, so the original developer can continue to perfect it.
Unfortunately, this was not happening in MySQL. In MariaDB 5.2 we have now pulled in quite many patches created in the MySQL community over the years. We had to spend significant effort to get them into acceptable quality. This is not how an open source project is supposed to work. (If you send a low quality patch to Linux, it’s not like Linus will hold your hand and fix it for you.) But since this kind of workflow has not been in place before, and many patches were several years old, we have considered our work on them as a “bootsrapping effort”. It didn’t feel right to go back to someone that contributed something 3 years ago and now ask them to fix a few things. Even so, we don’t intend to continue this way, we do want to turn MariaDB into a project where it is up to the contributor to finish several iterations of a patch, then we commit it.
And by the way, it is not like being employed by MySQL/Sun/Oracle magically makes you a flawless coder either. In recent merges we have started to reject some patches coming from MySQL, since they don’t pass our review, or more likely since they get caught in our automated QA (buildbot). For instance, by including some engines that MySQL doesn’t, we get broader test coverage then MySQL and sometimes catch errors that pass the MySQL process.
And to finish the loop, I suspect MariaDB developers (in particular those employed by Monty Program) are not perfect either! One of the patches we spent some effort improving before committing it was Segmented Key Cache. But now we get feedback from the original developer that the finalized patch gives him less performance boost than his original patch does. Maybe we broke something while reviewing? This is still being investigated as I write.
I just wanted to respond to this since there is a perception with many in MySQL (and I speak perhaps more of some managers than people like Kostja) that a non-employee simply couldn’t produce something useful and this community thing is just a distraction. I hope MariaDB 5.2 proves that there have been useful contributions, and I hope the future will prove there can be much more. And, no matter who you are employed with, you will produce bugs every now and then.
RDBMS software is difficultRDBMS software is difficult
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:11:"Henrik Ingo";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:22;a:6:{s:4:"data";s:63:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:56:"A database for everyone (comments on Sybase acquisition)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:25:"280 at http://openlife.cc";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:79:"http://openlife.cc/blogs/2010/may/database-everyone-comments-sybase-acquisition";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:346:"One thing I haven't seen anybody commenting on is the fact that with SAP acquiring Sybase, it will be the last major independent database company to be merged into a larger SW company. (To say this, you can qualify MySQL AB as a major database company, but disqualify, say, EnterpriseDB or InterBase, which imho is entirely reasonable.)
read more";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Fri, 28 May 2010 11:49:46 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:5:{i:0;a:5:{s:4:"data";s:9:"Databases";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:9:"Microsoft";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"Oracle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:6:"Sybase";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:683:"One thing I haven't seen anybody commenting on is the fact that with SAP acquiring Sybase, it will be the last major independent database company to be merged into a larger SW company. (To say this, you can qualify MySQL AB as a major database company, but disqualify, say, EnterpriseDB or InterBase, which imho is entirely reasonable.)
read more
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:11:"Henrik Ingo";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:23;a:6:{s:4:"data";s:83:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:25:"Introduction to memcached";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:36:"http://www.jurriaanpersyn.com/?p=354";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:76:"http://www.jurriaanpersyn.com/archives/2010/05/27/introduction-to-memcached/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:282:"These are the slides to a talk I did earlier this week for students of the professional bachelor in ICT course at KaHo St. Lieven. I wanted to give a clear and simple introduction to the memcached service, as I think it’s an invaluable tool in today’s web development.
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 21:52:32 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:9:{i:0;a:5:{s:4:"data";s:4:"tech";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:4:"work";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:7:"caching";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:8:"ikdoeict";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:12:"invalidation";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:15:"kaho st. lieven";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:9:"memcached";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:8;a:5:{s:4:"data";s:3:"php";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:588:"These are the slides to a talk I did earlier this week for students of the professional bachelor in ICT course at KaHo St. Lieven. I wanted to give a clear and simple introduction to the memcached service, as I think it’s an invaluable tool in today’s web development.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:15:"Jurriaan Persyn";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:24;a:6:{s:4:"data";s:43:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:63:"LEFT JOIN / IS NULL vs. NOT IN vs. NOT EXISTS: nullable columns";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:34:"http://explainextended.com/?p=4793";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:97:"http://explainextended.com/2010/05/27/left-join-is-null-vs-not-in-vs-not-exists-nullable-columns/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:13320:"In one of the previous articles I discussed performance of the three methods to implement an anti-join in MySQL.
Just a quick reminder: an anti-join is an operation that returns all records from one table which share a value of a certain column with no records from another table.
In SQL, there are at least three methods to implement it:
LEFT JOIN / IS NULL
SELECT o.*
FROM outer o
LEFT JOIN
inner i
ON i.value = o.value
WHERE i.value IS NULL
NOT IN
SELECT o.*
FROM outer o
WHERE o.value NOT IN
(
SELECT value
FROM inner
)
NOT EXISTS
SELECT o.*
FROM outer o
WHERE NOT EXISTS
(
SELECT NULL
FROM inner i
WHERE i.value = o.value
)
When inner.value is marked as NOT NULL, all these queries are semantically equivalent and with proper indexing have similarly optimized execution plans in MySQL.
Now, what if inner.value is not nullable and does contain some NULL values?
Let’s create some sample tables:
Table creation details
CREATE TABLE filler (
id INT NOT NULL PRIMARY KEY AUTO_INCREMENT
) ENGINE=MyISAM;
CREATE TABLE t_inner (
id INT NOT NULL PRIMARY KEY,
val INT,
stuffing VARCHAR(200) NOT NULL,
KEY ix_inner_val (val)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
CREATE TABLE t_outer (
id INT NOT NULL PRIMARY KEY,
val INT,
stuffing VARCHAR(200) NOT NULL,
KEY ix_outer_val (val)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
DELIMITER $$
CREATE PROCEDURE prc_filler(cnt INT)
BEGIN
DECLARE _cnt INT;
SET _cnt = 1;
WHILE _cnt <= cnt DO
INSERT
INTO filler
SELECT _cnt;
SET _cnt = _cnt + 1;
END WHILE;
END
$$
DELIMITER ;
START TRANSACTION;
CALL prc_filler(1000000);
COMMIT;
INSERT
INTO t_inner
SELECT id,
NULLIF(CEILING(RAND(20100527) * 100000), 100000),
RPAD('', 200, '*')
FROM filler;
INSERT
INTO t_outer
SELECT id,
NULLIF(CEILING(RAND(20100527 << 1) * 100000), 100000),
RPAD('', 200, '*')
FROM filler;
There are two identical MyISAM tables. Each of the tables contains 1,000,000 random values from 1 to 99,999 and also some NULL values. There is an index on value in both tables.
Now, let’s check the queries.
NOT EXISTS
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE NOT EXISTS
(
SELECT NULL
FROM t_inner i
WHERE i.val = o.val
)
SUM(LENGTH(stuffing))
COUNT(*)
14600
73
1 row fetched in 0.0001s (9.9061s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
PRIMARY
o
ALL
1000000
100.00
Using where
2
DEPENDENT SUBQUERY
i
ref
ix_inner_val
ix_inner_val
5
20100527_anti.o.val
10
100.00
Using where; Using index
Field or reference '20100527_anti.o.val' of SELECT #2 was resolved in SELECT #1
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (not(exists(select NULL from `20100527_anti`.`t_inner` `i` where (`20100527_anti`.`i`.`val` = `20100527_anti`.`o`.`val`))))
The query completes in 9.9 seconds. As we can see, it is optimized to use the index on t_inner.val and return on the first match.
LEFT JOIN / IS NULL
SELECT SUM(LENGTH(o.stuffing)), COUNT(*)
FROM t_outer o
LEFT JOIN
t_inner i
ON i.val = o.val
WHERE i.id IS NULL
SUM(LENGTH(o.stuffing))
COUNT(*)
14600
73
1 row fetched in 0.0001s (13.5154s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
SIMPLE
o
ALL
1000000
100.00
1
SIMPLE
i
ref
ix_inner_val
ix_inner_val
5
20100527_anti.o.val
10
100.00
Using where; Not exists
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(o.stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` left join `20100527_anti`.`t_inner` `i` on((`20100527_anti`.`i`.`val` = `20100527_anti`.`o`.`val`)) where isnull(`20100527_anti`.`i`.`id`)
The query semantics are the same as those of NOT EXISTS, and we even see the Not exists optimization in the plan, however this query performs much more poorly than NOT EXISTS: 13 seconds. Why?
MySQL documentation on EXPLAIN states that Not exists is used to optimize the queries similar to the one we have just run: LEFT JOIN with IS NULL predicate applied to a non-nullable column.
MySQL is aware that such a predicate can only be satisfied by a record resulting from a JOIN miss (i. e. when no matching record was found in the rightmost table) and stops reading records after first index hit.
However, this optimization is implemented in a way that is far from being perfect. Despite the fact that no actual value of id can be returned by such a query, the engine still looks up id in the table (since it’s not a part of the index). We can see it in the plan: unlike NOT EXISTS query, there is no Using index for t_inner. This means that a table lookup is performed.
Even we replace id with val in the query, it still performs poorly:
SELECT SUM(LENGTH(o.stuffing)), COUNT(*)
FROM t_outer o
LEFT JOIN
t_inner i
ON i.val = o.val
WHERE i.val IS NULL
SUM(LENGTH(o.stuffing))
COUNT(*)
14600
73
1 row fetched in 0.0001s (14.4997s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
SIMPLE
o
ALL
1000000
100.00
1
SIMPLE
i
ref
ix_inner_val
ix_inner_val
5
20100527_anti.o.val
10
100.00
Using where; Using index
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(o.stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` left join `20100527_anti`.`t_inner` `i` on((`20100527_anti`.`i`.`val` = `20100527_anti`.`o`.`val`)) where isnull(`20100527_anti`.`i`.`val`)
This time, no table lookups are made but there is no Not exists optimization either.
Despite the fact that the join condition eliminates possibility of an actual NULL being returned by the query and any val IS NULL reaching the WHERE clause is a result of a join miss, MySQL still examines all records in t_inner, not stopping after the first hit.
This had been submitted as a bug.
Now, what about NOT IN?
NOT IN
Unlike the previous two queries that only differ in implementation, not in semantics, NOT IN, being applied as is, would yield the different results.
NOT EXISTS and IS NULL are two-state predicates, they can only return TRUE or FALSE. NOT IN is a three-state predicate: it can return TRUE, FALSE or NULL.
NULL value is returned in two cases:
When t_outer.value being tested is NULL
When at least one of t_inner.value is NULL
This means that having but a single NULL in t_inner would prevent the query from returning anything.
Naive approach
Let’s see what happens if we just substitute NOT IN instead of NOT EXISTS:
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE val NOT IN
(
SELECT val
FROM t_inner i
)
SUM(LENGTH(stuffing))
COUNT(*)
0
1 row fetched in 0.0001s (10.3748s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
PRIMARY
o
ALL
1000000
100.00
Using where
2
DEPENDENT SUBQUERY
i
index_subquery
ix_inner_val
ix_inner_val
5
func
20
100.00
Using index; Full scan on NULL key
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (not(<in_optimizer>(`20100527_anti`.`o`.`val`,<exists>(<index_lookup>(<cache>(`20100527_anti`.`o`.`val`) in t_inner on ix_inner_val checking NULL having trigcond(<is_not_null_test>(`20100527_anti`.`i`.`val`)))))))
Since there are NULLs in t_inner, no record in t_outer can satisfy the predicate.
MySQL does not optimize this very well. It takes but a single index scan to find out if there are NULL values in t_inner and return if they are, but for some reason MySQL still applies the condition to each record in t_outer.
Naive approach, improved
With a little help from our side, this can be improved:
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE NOT EXISTS
(
SELECT NULL
FROM t_inner i
WHERE val IS NULL
)
AND val NOT IN
(
SELECT val
FROM t_inner i
)
SUM(LENGTH(stuffing))
COUNT(*)
0
1 row fetched in 0.0001s (0.0014s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
PRIMARY
Impossible WHERE
3
DEPENDENT SUBQUERY
i
index_subquery
ix_inner_val
ix_inner_val
5
func
20
100.00
Using index; Full scan on NULL key
2
SUBQUERY
i
ref
ix_inner_val
ix_inner_val
5
4
100.00
Using where; Using index
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where 0
We added an explicit check for NULL values. Since it’s not correlated, MySQL could instantly prove it false, cache it and avoid the table scan at all.
Ignoring right side NULLs
Now, let’s make a NOT IN query that does not take the NULL values in t_inner into account:
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE val NOT IN
(
SELECT val
FROM t_inner i
WHERE val IS NOT NULL
)
SUM(LENGTH(stuffing))
COUNT(*)
13400
67
1 row fetched in 0.0001s (10.4060s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
PRIMARY
o
ALL
1000000
100.00
Using where
2
DEPENDENT SUBQUERY
i
index_subquery
ix_inner_val
ix_inner_val
5
func
20
100.00
Using index; Using where; Full scan on NULL key
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (not(<in_optimizer>(`20100527_anti`.`o`.`val`,<exists>(<index_lookup>(<cache>(`20100527_anti`.`o`.`val`) in t_inner on ix_inner_val checking NULL where (`20100527_anti`.`i`.`val` is not null) having trigcond(<is_not_null_test>(`20100527_anti`.`i`.`val`)))))))
This time, the query returns records, but not as many as the previous queries did.
We made an additional check for NULL in t_inner but not in t_outer. There are some records in t_outer that have a NULL in val. Both IN and NOT IN would evaluate to NULL and WHERE would filter them out.
We see another glitch in MySQL optimizer here: a Full scan on NULL key applied. Since NOT IN should always return TRUE when the subquery returns no records (even if the value checked is a NULL), on correlated queries a fullscan should be applied to check for the records and find out whether to return NULL or FALSE. However, in this case the IN subquery is not correlated, so the check could only be performed once and cached, like with the LEFT JOIN.
In our case the overhead would be negligible, since the subquery would return on the first match, but it could matter if we had more NULL values in t_outer.
Now, what if we want NULL records on t_outer to be returned as well? We just need to add an additional check for NULLs.
Ignoring all NULLs
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE val IS NULL
OR val NOT IN
(
SELECT val
FROM t_inner i
WHERE val IS NOT NULL
)
SUM(LENGTH(stuffing))
COUNT(*)
14600
73
1 row fetched in 0.0002s (10.4842s)
id
select_type
table
type
possible_keys
key
key_len
ref
rows
filtered
Extra
1
PRIMARY
o
ALL
ix_outer_val
1000000
100.00
Using where
2
DEPENDENT SUBQUERY
i
index_subquery
ix_inner_val
ix_inner_val
5
func
20
100.00
Using index; Using where; Full scan on NULL key
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (isnull(`20100527_anti`.`o`.`val`) or (not(<in_optimizer>(`20100527_anti`.`o`.`val`,<exists>(<index_lookup>(<cache>(`20100527_anti`.`o`.`val`) in t_inner on ix_inner_val checking NULL where (`20100527_anti`.`i`.`val` is not null) having trigcond(<is_not_null_test>(`20100527_anti`.`i`.`val`))))))))
Here, the query returns the same results as NOT EXISTS.
Full scan on NULL key is still present in the plan but will never actually be executed because it will be short circuited by the previous IS NULL check.
Summary
As was shown in the earlier article, LEFT JOIN / IS NULL and NOT IN are best used to implement an anti-join in MySQL if the columns on both sides are not nullable.
The situation is different when the columns are nullable:
NOT EXISTS performs in most straightforward way: just checks equality and returns TRUE or FALSE on the first hit / miss.
LEFT JOIN / IS NULL either makes an additional table lookup or does not return on the first match and performs more poorly in both cases.
NOT IN, having different semantics, requires additional checks for NULL values. These checks should be coded into the query
With nullable columns, NOT EXISTS and NOT IN (with additional checks for NULLS) are the most efficient methods to implement an anti-join in MySQL.
LEFT JOIN / IS NULL performs poorly.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 19:00:15 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:1:{i:0;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:19494:"In one of the previous articles I discussed performance of the three methods to implement an anti-join in MySQL.
Just a quick reminder: an anti-join is an operation that returns all records from one table which share a value of a certain column with no records from another table.
In SQL, there are at least three methods to implement it:
LEFT JOIN / IS NULL
SELECT o.*
FROM outer o
LEFT JOIN
inner i
ON i.value = o.value
WHERE i.value IS NULL
NOT IN
SELECT o.*
FROM outer o
WHERE o.value NOT IN
(
SELECT value
FROM inner
)
NOT EXISTS
SELECT o.*
FROM outer o
WHERE NOT EXISTS
(
SELECT NULL
FROM inner i
WHERE i.value = o.value
)
When inner.value is marked as NOT NULL, all these queries are semantically equivalent and with proper indexing have similarly optimized execution plans in MySQL.
Now, what if inner.value is not nullable and does contain some NULL values?
Let’s create some sample tables:
Table creation details
CREATE TABLE filler (
id INT NOT NULL PRIMARY KEY AUTO_INCREMENT
) ENGINE=MyISAM;
CREATE TABLE t_inner (
id INT NOT NULL PRIMARY KEY,
val INT,
stuffing VARCHAR(200) NOT NULL,
KEY ix_inner_val (val)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
CREATE TABLE t_outer (
id INT NOT NULL PRIMARY KEY,
val INT,
stuffing VARCHAR(200) NOT NULL,
KEY ix_outer_val (val)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
DELIMITER $$
CREATE PROCEDURE prc_filler(cnt INT)
BEGIN
DECLARE _cnt INT;
SET _cnt = 1;
WHILE _cnt <= cnt DO
INSERT
INTO filler
SELECT _cnt;
SET _cnt = _cnt + 1;
END WHILE;
END
$$
DELIMITER ;
START TRANSACTION;
CALL prc_filler(1000000);
COMMIT;
INSERT
INTO t_inner
SELECT id,
NULLIF(CEILING(RAND(20100527) * 100000), 100000),
RPAD('', 200, '*')
FROM filler;
INSERT
INTO t_outer
SELECT id,
NULLIF(CEILING(RAND(20100527 << 1) * 100000), 100000),
RPAD('', 200, '*')
FROM filler;
There are two identical MyISAM tables. Each of the tables contains 1,000,000 random values from 1 to 99,999 and also some NULL values. There is an index on value in both tables.
Now, let’s check the queries.
NOT EXISTS
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE NOT EXISTS
(
SELECT NULL
FROM t_inner i
WHERE i.val = o.val
)
| SUM(LENGTH(stuffing)) |
COUNT(*) |
| 14600 |
73 |
| 1 row fetched in 0.0001s (9.9061s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
PRIMARY |
o |
ALL |
|
|
|
|
1000000 |
100.00 |
Using where |
| 2 |
DEPENDENT SUBQUERY |
i |
ref |
ix_inner_val |
ix_inner_val |
5 |
20100527_anti.o.val |
10 |
100.00 |
Using where; Using index |
Field or reference '20100527_anti.o.val' of SELECT #2 was resolved in SELECT #1
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (not(exists(select NULL from `20100527_anti`.`t_inner` `i` where (`20100527_anti`.`i`.`val` = `20100527_anti`.`o`.`val`))))
The query completes in 9.9 seconds. As we can see, it is optimized to use the index on t_inner.val and return on the first match.
LEFT JOIN / IS NULL
SELECT SUM(LENGTH(o.stuffing)), COUNT(*)
FROM t_outer o
LEFT JOIN
t_inner i
ON i.val = o.val
WHERE i.id IS NULL
| SUM(LENGTH(o.stuffing)) |
COUNT(*) |
| 14600 |
73 |
| 1 row fetched in 0.0001s (13.5154s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
SIMPLE |
o |
ALL |
|
|
|
|
1000000 |
100.00 |
|
| 1 |
SIMPLE |
i |
ref |
ix_inner_val |
ix_inner_val |
5 |
20100527_anti.o.val |
10 |
100.00 |
Using where; Not exists |
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(o.stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` left join `20100527_anti`.`t_inner` `i` on((`20100527_anti`.`i`.`val` = `20100527_anti`.`o`.`val`)) where isnull(`20100527_anti`.`i`.`id`)
The query semantics are the same as those of NOT EXISTS, and we even see the Not exists optimization in the plan, however this query performs much more poorly than NOT EXISTS: 13 seconds. Why?
MySQL documentation on EXPLAIN states that Not exists is used to optimize the queries similar to the one we have just run: LEFT JOIN with IS NULL predicate applied to a non-nullable column.
MySQL is aware that such a predicate can only be satisfied by a record resulting from a JOIN miss (i. e. when no matching record was found in the rightmost table) and stops reading records after first index hit.
However, this optimization is implemented in a way that is far from being perfect. Despite the fact that no actual value of id can be returned by such a query, the engine still looks up id in the table (since it’s not a part of the index). We can see it in the plan: unlike NOT EXISTS query, there is no Using index for t_inner. This means that a table lookup is performed.
Even we replace id with val in the query, it still performs poorly:
SELECT SUM(LENGTH(o.stuffing)), COUNT(*)
FROM t_outer o
LEFT JOIN
t_inner i
ON i.val = o.val
WHERE i.val IS NULL
| SUM(LENGTH(o.stuffing)) |
COUNT(*) |
| 14600 |
73 |
| 1 row fetched in 0.0001s (14.4997s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
SIMPLE |
o |
ALL |
|
|
|
|
1000000 |
100.00 |
|
| 1 |
SIMPLE |
i |
ref |
ix_inner_val |
ix_inner_val |
5 |
20100527_anti.o.val |
10 |
100.00 |
Using where; Using index |
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(o.stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` left join `20100527_anti`.`t_inner` `i` on((`20100527_anti`.`i`.`val` = `20100527_anti`.`o`.`val`)) where isnull(`20100527_anti`.`i`.`val`)
This time, no table lookups are made but there is no Not exists optimization either.
Despite the fact that the join condition eliminates possibility of an actual NULL being returned by the query and any val IS NULL reaching the WHERE clause is a result of a join miss, MySQL still examines all records in t_inner, not stopping after the first hit.
This had been submitted as a bug.
Now, what about NOT IN?
NOT IN
Unlike the previous two queries that only differ in implementation, not in semantics, NOT IN, being applied as is, would yield the different results.
NOT EXISTS and IS NULL are two-state predicates, they can only return TRUE or FALSE. NOT IN is a three-state predicate: it can return TRUE, FALSE or NULL.
NULL value is returned in two cases:
- When
t_outer.value being tested is NULL
- When at least one of
t_inner.value is NULL
This means that having but a single NULL in t_inner would prevent the query from returning anything.
Naive approach
Let’s see what happens if we just substitute NOT IN instead of NOT EXISTS:
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE val NOT IN
(
SELECT val
FROM t_inner i
)
| SUM(LENGTH(stuffing)) |
COUNT(*) |
|
0 |
| 1 row fetched in 0.0001s (10.3748s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
PRIMARY |
o |
ALL |
|
|
|
|
1000000 |
100.00 |
Using where |
| 2 |
DEPENDENT SUBQUERY |
i |
index_subquery |
ix_inner_val |
ix_inner_val |
5 |
func |
20 |
100.00 |
Using index; Full scan on NULL key |
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (not(<in_optimizer>(`20100527_anti`.`o`.`val`,<exists>(<index_lookup>(<cache>(`20100527_anti`.`o`.`val`) in t_inner on ix_inner_val checking NULL having trigcond(<is_not_null_test>(`20100527_anti`.`i`.`val`)))))))
Since there are NULLs in t_inner, no record in t_outer can satisfy the predicate.
MySQL does not optimize this very well. It takes but a single index scan to find out if there are NULL values in t_inner and return if they are, but for some reason MySQL still applies the condition to each record in t_outer.
Naive approach, improved
With a little help from our side, this can be improved:
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE NOT EXISTS
(
SELECT NULL
FROM t_inner i
WHERE val IS NULL
)
AND val NOT IN
(
SELECT val
FROM t_inner i
)
| SUM(LENGTH(stuffing)) |
COUNT(*) |
|
0 |
| 1 row fetched in 0.0001s (0.0014s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
PRIMARY |
|
|
|
|
|
|
|
|
Impossible WHERE |
| 3 |
DEPENDENT SUBQUERY |
i |
index_subquery |
ix_inner_val |
ix_inner_val |
5 |
func |
20 |
100.00 |
Using index; Full scan on NULL key |
| 2 |
SUBQUERY |
i |
ref |
ix_inner_val |
ix_inner_val |
5 |
|
4 |
100.00 |
Using where; Using index |
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where 0
We added an explicit check for NULL values. Since it’s not correlated, MySQL could instantly prove it false, cache it and avoid the table scan at all.
Ignoring right side NULLs
Now, let’s make a NOT IN query that does not take the NULL values in t_inner into account:
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE val NOT IN
(
SELECT val
FROM t_inner i
WHERE val IS NOT NULL
)
| SUM(LENGTH(stuffing)) |
COUNT(*) |
| 13400 |
67 |
| 1 row fetched in 0.0001s (10.4060s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
PRIMARY |
o |
ALL |
|
|
|
|
1000000 |
100.00 |
Using where |
| 2 |
DEPENDENT SUBQUERY |
i |
index_subquery |
ix_inner_val |
ix_inner_val |
5 |
func |
20 |
100.00 |
Using index; Using where; Full scan on NULL key |
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (not(<in_optimizer>(`20100527_anti`.`o`.`val`,<exists>(<index_lookup>(<cache>(`20100527_anti`.`o`.`val`) in t_inner on ix_inner_val checking NULL where (`20100527_anti`.`i`.`val` is not null) having trigcond(<is_not_null_test>(`20100527_anti`.`i`.`val`)))))))
This time, the query returns records, but not as many as the previous queries did.
We made an additional check for NULL in t_inner but not in t_outer. There are some records in t_outer that have a NULL in val. Both IN and NOT IN would evaluate to NULL and WHERE would filter them out.
We see another glitch in MySQL optimizer here: a Full scan on NULL key applied. Since NOT IN should always return TRUE when the subquery returns no records (even if the value checked is a NULL), on correlated queries a fullscan should be applied to check for the records and find out whether to return NULL or FALSE. However, in this case the IN subquery is not correlated, so the check could only be performed once and cached, like with the LEFT JOIN.
In our case the overhead would be negligible, since the subquery would return on the first match, but it could matter if we had more NULL values in t_outer.
Now, what if we want NULL records on t_outer to be returned as well? We just need to add an additional check for NULLs.
Ignoring all NULLs
SELECT SUM(LENGTH(stuffing)), COUNT(*)
FROM t_outer o
WHERE val IS NULL
OR val NOT IN
(
SELECT val
FROM t_inner i
WHERE val IS NOT NULL
)
| SUM(LENGTH(stuffing)) |
COUNT(*) |
| 14600 |
73 |
| 1 row fetched in 0.0002s (10.4842s) |
| id |
select_type |
table |
type |
possible_keys |
key |
key_len |
ref |
rows |
filtered |
Extra |
| 1 |
PRIMARY |
o |
ALL |
ix_outer_val |
|
|
|
1000000 |
100.00 |
Using where |
| 2 |
DEPENDENT SUBQUERY |
i |
index_subquery |
ix_inner_val |
ix_inner_val |
5 |
func |
20 |
100.00 |
Using index; Using where; Full scan on NULL key |
select sum(length(`20100527_anti`.`o`.`stuffing`)) AS `SUM(LENGTH(stuffing))`,count(0) AS `COUNT(*)` from `20100527_anti`.`t_outer` `o` where (isnull(`20100527_anti`.`o`.`val`) or (not(<in_optimizer>(`20100527_anti`.`o`.`val`,<exists>(<index_lookup>(<cache>(`20100527_anti`.`o`.`val`) in t_inner on ix_inner_val checking NULL where (`20100527_anti`.`i`.`val` is not null) having trigcond(<is_not_null_test>(`20100527_anti`.`i`.`val`))))))))
Here, the query returns the same results as NOT EXISTS.
Full scan on NULL key is still present in the plan but will never actually be executed because it will be short circuited by the previous IS NULL check.
Summary
As was shown in the earlier article, LEFT JOIN / IS NULL and NOT IN are best used to implement an anti-join in MySQL if the columns on both sides are not nullable.
The situation is different when the columns are nullable:
NOT EXISTS performs in most straightforward way: just checks equality and returns TRUE or FALSE on the first hit / miss.
LEFT JOIN / IS NULL either makes an additional table lookup or does not return on the first match and performs more poorly in both cases.
NOT IN, having different semantics, requires additional checks for NULL values. These checks should be coded into the query
With nullable columns, NOT EXISTS and NOT IN (with additional checks for NULLS) are the most efficient methods to implement an anti-join in MySQL.
LEFT JOIN / IS NULL performs poorly.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:23:"Alex Bolenok (Quassnoi)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:25;a:6:{s:4:"data";s:53:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:10:"NDB ENGINE";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:34:"http://dbperf.wordpress.com/?p=114";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:62:"http://dbperf.wordpress.com/2010/05/27/ndb-engine-limitations/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:198:"For latest developments and updates on cluster visit below.
Many limitations of previous versions addressed.
http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-development-5-1-ndb-7-1.html
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 16:15:04 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:23:"Database Configurations";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:15:"NDB Limitations";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:18:"NDB Storage Engine";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1651:"For latest developments and updates on cluster visit below.
Many limitations of previous versions addressed.
http://dev.mysql.com/doc/refman/5.1/en/mysql-cluster-development-5-1-ndb-7-1.html

PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:16:"Anirudh Tamsekar";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:26;a:6:{s:4:"data";s:48:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:58:"InfiniDB Alpha 1.1.2 on 64-bit Windows and MySQL Workbench";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:69:"tag:blogger.com,1999:blog-8575059197193667898.post-798082744101239488";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:81:"http://dave-stokes.blogspot.com/2010/05/infinidb-alpha-112-on-64-bit-windows.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:766:"This week has been rough on my computers and I had to reinstall Vista on my main Windows system. But being a 'glass half full' guy, I took it as an opportunity to load the new alphas for both InfiniDB and MySQl Workbench.I downloaded the InfiniDB software from http://infinidb.org/downloads/cat_view/40-binary-release/137-112-alpha-binary-releases and Workbench from http://dev.mysql.com/downloads/workbench/ respectively. In stalled them in that order and started the database and then Workbench.I was expecting to have to fiddle with configuring one or the other to get them to work together. But I received a delightful surprise. Both worked out of the chute with no customization. Congratulations to the folks writing the install scripts at both companies!";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 15:58:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:2:{i:0;a:5:{s:4:"data";s:15:"MySQL Workbench";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:8:"InfiniDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1167:"This week has been rough on my computers and I had to reinstall Vista on my main Windows system. But being a 'glass half full' guy, I took it as an opportunity to load the new alphas for both InfiniDB and MySQl Workbench.
I downloaded the InfiniDB software from http://infinidb.org/downloads/cat_view/40-binary-release/137-112-alpha-binary-releases and Workbench from http://dev.mysql.com/downloads/workbench/ respectively. In stalled them in that order and started the database and then Workbench.
I was expecting to have to fiddle with configuring one or the other to get them to work together. But I received a delightful surprise. Both worked out of the chute with no customization. Congratulations to the folks writing the install scripts at both companies!
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:11:"Dave Stokes";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:27;a:6:{s:4:"data";s:53:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:53:"Book on Finnish startups includes chapter on MySQL AB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:25:"279 at http://openlife.cc";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:81:"http://openlife.cc/blogs/2010/may/book-finnish-startups-includes-chapter-mysql-ab";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1188:"Tekes, a Finnish government agency funding R&D in Technology and Innovation (including MariaDB) has recently published a book on Finnish startups, (PDF), which contains a whole chapter on MySQL AB.
It seems to be a well researched chapter and references many past interviews over the years, as well as being based on interviews of at least Mårten, Monty and Kevin Harvey of Benchmark. This is the most comprehensive narrative I've ever seen of items like "InnoDB Friday", a phrase I thought until now was company confidential, since talking about it would have revealed there was something negative about the day Oracle bought InnoDB (no kidding?). It also reveals what MySQL (AB) thought about the fact that PostgreSQL at one time was more popular than MySQL in one country in the world: Japan, or how much it raised VC capital. On the other hand it still only mentions some issues anonymously or only between the lines and reader is left guessing whether he should fill in "Oracle", "SAP" or something else in the gaps. (And I'm too much a coward to blog the right answers... Ok, so Google will tell you Oracle is the one who tried to acquire MySQL several times before.)
read more";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 10:16:49 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:15:"Business models";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"Oracle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1896:"Tekes, a Finnish government agency funding R&D in Technology and Innovation (including MariaDB) has recently published a book on Finnish startups, (PDF), which contains a whole chapter on MySQL AB.
It seems to be a well researched chapter and references many past interviews over the years, as well as being based on interviews of at least Mårten, Monty and Kevin Harvey of Benchmark. This is the most comprehensive narrative I've ever seen of items like "InnoDB Friday", a phrase I thought until now was company confidential, since talking about it would have revealed there was something negative about the day Oracle bought InnoDB (no kidding?). It also reveals what MySQL (AB) thought about the fact that PostgreSQL at one time was more popular than MySQL in one country in the world: Japan, or how much it raised VC capital. On the other hand it still only mentions some issues anonymously or only between the lines and reader is left guessing whether he should fill in "Oracle", "SAP" or something else in the gaps. (And I'm too much a coward to blog the right answers... Ok, so Google will tell you Oracle is the one who tried to acquire MySQL several times before.)
read more
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:11:"Henrik Ingo";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:28;a:6:{s:4:"data";s:53:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:54:"MySQL Connector/Python 0.1.5 release: critical bug fix";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-7603704315097619422.post-5572935448913790756";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:75:"http://geert.vanderkelen.org/2010/05/mysql-connectorpython-015-release.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:843:"We just released MySQL Connector/Python 0.1.5 which includes a critical bug fix. It was impossible to read big result sets. The files for 0.1.4-release have been removed.You can download MySQL Connector/Python from Launchpad.Highlights:It was impossible to retrieve big result sets. (bug lp:551533 and lp:586003)
Changing copyright from Sun to Oracle (also fixing silly typo)
A very Big Thanks goes to the reporters of bug lp:551533 and lp:586003. Apologies for not being able to reproduce the bug earlier, before releasing 0.1.4.About MySQL Connector/Python: MySQL Connector/Python is implementing the MySQL Client/Server protocol completely in Python. No MySQL libraries are needed, and no compilation is necessary to run this Python DB API v2.0 compliant driver. It is compatible with Python v2.5 and later as well as Python v3.1 and later.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 09:50:24 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:6:"python";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:8:"myconnpy";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1529:"We just released MySQL Connector/Python 0.1.5 which includes a critical bug fix. It was impossible to read big result sets. The files for 0.1.4-release have been removed.
You can download MySQL Connector/Python from Launchpad.
Highlights:
- It was impossible to retrieve big result sets. (bug lp:551533 and lp:586003)
- Changing copyright from Sun to Oracle (also fixing silly typo)
A very Big Thanks goes to the reporters of bug lp:551533 and lp:586003. Apologies for not being able to reproduce the bug earlier, before releasing 0.1.4.
About MySQL Connector/Python: MySQL Connector/Python is implementing the MySQL Client/Server protocol completely in Python. No MySQL libraries are needed, and no compilation is necessary to run this Python DB API v2.0 compliant driver. It is compatible with Python v2.5 and later as well as Python v3.1 and later.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:17:"Geert Vanderkelen";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:29;a:6:{s:4:"data";s:83:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:41:"Unqualified COUNT(*) speed PBXT vs InnoDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://openquery.com/blog/?p=1261";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:64:"http://openquery.com/blog/unqualified-count-speed-pbxt-vs-innodb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2153:"So this is about a SELECT COUNT(*) FROM tblname without a WHERE clause. MyISAM has an optimisation for that since it maintains a rowcount for each table. InnoDB and PBXT can’t do that (at least not easily) because of their multi-versioned nature… different transactions may see a different number of rows for the table table!
So, it’s kinda known but nevertheless often ignored that this operation on InnoDB is costly in terms of time; what InnoDB has to do to figure out the exact number of rows is scan the primary key and just tally. Of course it’s faster if it doesn’t have to read a lot of the blocks from disk (i.e. smaller dataset or a large enough buffer pool).
I was curious about PBXT’s performance on this, and behold it appears to be quite a bit faster! For a table with 50 million rows, PBXT took about 20 minutes whereas the same table in InnoDB took 30 minutes. Interesting!
From those numbers [addendum: yes I do realise there's something else wrong on that server to take that long, but it'd be slow regardless] you can tell that doing the query at all is not an efficient thing to do, and definitely not something a frontend web page should be doing. Usually you just need a ballpark figure so running the query in a cron job and putting the value into memcached (or just an include file) will work well in such cases.
If you do use a WHERE clause, all engines (including MyISAM) are in the same boat… they might be able to use an index to filter on the conditions – but the bigger the table, the more work it is for the engine. PBXT being faster than InnoDB for this task makes it potentially interesting for reporting purposes as well, where otherwise you might consider using MyISAM – we generally recommend using a separate reporting slave with particular settings anyway (fewer connections but larger session-specific buffers), but it’s good to have extra choices for the task.
(In case you didn’t know, it’s ok for a slave to use a different engine from a master – so you can really make use of that ability for specialised tasks such as reporting.)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 04:54:47 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:9:{i:0;a:5:{s:4:"data";s:28:"Good practice / Bad practice";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"COUNT";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:10:"index scan";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"InnoDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:7:"mariadb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:6:"MyISAM";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:4:"pbxt";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:8;a:5:{s:4:"data";s:9:"reporting";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2451:"So this is about a SELECT COUNT(*) FROM tblname without a WHERE clause. MyISAM has an optimisation for that since it maintains a rowcount for each table. InnoDB and PBXT can’t do that (at least not easily) because of their multi-versioned nature… different transactions may see a different number of rows for the table table!
So, it’s kinda known but nevertheless often ignored that this operation on InnoDB is costly in terms of time; what InnoDB has to do to figure out the exact number of rows is scan the primary key and just tally. Of course it’s faster if it doesn’t have to read a lot of the blocks from disk (i.e. smaller dataset or a large enough buffer pool).
I was curious about PBXT’s performance on this, and behold it appears to be quite a bit faster! For a table with 50 million rows, PBXT took about 20 minutes whereas the same table in InnoDB took 30 minutes. Interesting!
From those numbers [addendum: yes I do realise there's something else wrong on that server to take that long, but it'd be slow regardless] you can tell that doing the query at all is not an efficient thing to do, and definitely not something a frontend web page should be doing. Usually you just need a ballpark figure so running the query in a cron job and putting the value into memcached (or just an include file) will work well in such cases.
If you do use a WHERE clause, all engines (including MyISAM) are in the same boat… they might be able to use an index to filter on the conditions – but the bigger the table, the more work it is for the engine. PBXT being faster than InnoDB for this task makes it potentially interesting for reporting purposes as well, where otherwise you might consider using MyISAM – we generally recommend using a separate reporting slave with particular settings anyway (fewer connections but larger session-specific buffers), but it’s good to have extra choices for the task.
(In case you didn’t know, it’s ok for a slave to use a different engine from a master – so you can really make use of that ability for specialised tasks such as reporting.)
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:10:"Open Query";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:30;a:6:{s:4:"data";s:73:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:40:"PBXT early impressions in production use";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://openquery.com/blog/?p=1257";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:59:"http://openquery.com/blog/pbxt-early-impressions-production";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2630:"With Paul McCullagh’s PBXT storage engine getting integrated into MariaDB 5.1, it’s never been easier to it out. So we have, on a slave off one of our own production systems which gets lots of inserts from our Zabbix monitoring system.
That’s possibly an ideal usage profile, since PBXT is a log based engine (simplistically stated, it indexes its transaction logs, rather than rewriting data from log into index and indexing that) so it should require less disk I/O than say InnoDB. And that means it should be particularly suited to for instance logging, which have lots of inserts on a sustained basis. Note that for short insert burst you may not see a difference with InnoDB because of caching, but sustain it and then you can notice.
Because PBXT has such different/distinct architecture there’s a lot of learning involved. Together with Paul and help from Roland Bouman we also created a stored procedure that can calculate the optimal average row size for PBXT, and even ALTER TABLE statements you can paste to convert tables. The AVG_ROW_LENGTH option is quite critical with PBXT, if set too big (or if you let PBXT guess and it gets it wrong) it’ll eat heaps more diskspace as well as being much slower, and if too small it’ll be slower also; this, it needs to be in the right ballpark. For existing datasets it can be calculated, so that’s what we’ve worked on. The procs will be published shortly, and Paul will also put them in with the rest of the PBXT files.
Another important aspect for PBXT is having sufficient cache memory allocated, otherwise operations can take much much longer. While the exact “cause” is different, one would notice similar performance aspects when using InnoDB on larger datasets and buffers that are too small for the purpose.
So, while using or converting some tables to PBXT takes a bit of consideration, effort and learning, it appears to be dealing with the real world very well so far – and that’s a testament to Paul’s experience. Paul is also very responsive to questions. As we gain more experience, it is our intent to try PBXT for some of our clients that have operational needs that might be a particularly good fit for PBXT.
I should also mention that it is possible to have a consistent transaction between PBXT, InnoDB and the binary log, because of the 2-phase commit (XA) infrastructure. This means that you should even be able to do a mysqldump with –single-transaction if you have both PBXT and InnoDB tables, and acquire a consistent snapshot!
More experiences and details to come.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 02:03:19 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:7:{i:0;a:5:{s:4:"data";s:18:"Software and tools";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:6:"InnoDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:7:"mariadb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:4:"pbxt";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:14:"storage engine";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:2:"XA";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2993:"With Paul McCullagh’s PBXT storage engine getting integrated into MariaDB 5.1, it’s never been easier to it out. So we have, on a slave off one of our own production systems which gets lots of inserts from our Zabbix monitoring system.
That’s possibly an ideal usage profile, since PBXT is a log based engine (simplistically stated, it indexes its transaction logs, rather than rewriting data from log into index and indexing that) so it should require less disk I/O than say InnoDB. And that means it should be particularly suited to for instance logging, which have lots of inserts on a sustained basis. Note that for short insert burst you may not see a difference with InnoDB because of caching, but sustain it and then you can notice.
Because PBXT has such different/distinct architecture there’s a lot of learning involved. Together with Paul and help from Roland Bouman we also created a stored procedure that can calculate the optimal average row size for PBXT, and even ALTER TABLE statements you can paste to convert tables. The AVG_ROW_LENGTH option is quite critical with PBXT, if set too big (or if you let PBXT guess and it gets it wrong) it’ll eat heaps more diskspace as well as being much slower, and if too small it’ll be slower also; this, it needs to be in the right ballpark. For existing datasets it can be calculated, so that’s what we’ve worked on. The procs will be published shortly, and Paul will also put them in with the rest of the PBXT files.
Another important aspect for PBXT is having sufficient cache memory allocated, otherwise operations can take much much longer. While the exact “cause” is different, one would notice similar performance aspects when using InnoDB on larger datasets and buffers that are too small for the purpose.
So, while using or converting some tables to PBXT takes a bit of consideration, effort and learning, it appears to be dealing with the real world very well so far – and that’s a testament to Paul’s experience. Paul is also very responsive to questions. As we gain more experience, it is our intent to try PBXT for some of our clients that have operational needs that might be a particularly good fit for PBXT.
I should also mention that it is possible to have a consistent transaction between PBXT, InnoDB and the binary log, because of the 2-phase commit (XA) infrastructure. This means that you should even be able to do a mysqldump with –single-transaction if you have both PBXT and InnoDB tables, and acquire a consistent snapshot!
More experiences and details to come.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:10:"Open Query";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:31;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:45:"Reacting to small variations in response time";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://www.xaprb.com/blog/?p=1879";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:83:"http://www.xaprb.com/blog/2010/05/26/reacting-to-small-variations-in-response-time/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:14921:"I wrote recently about early detection for MySQL performance problems. If your server is having micro-fluctuations in performance, it’s important to know, because very soon they will turn much worse. What can you do about this?
The most important thing is not to guess at what’s happening, but to measure instead. I have seen these problems from DNS, the binary log, failing hardware, the query cache, the table cache, the thread cache, and a variety of InnoDB edge cases. Guessing at the problem is very dangerous; you need diagnostic data. But it is often quite hard to catch a problem in action when you can only observe it in hindsight, and it happens only for a few seconds once or twice a week. This blog post is about how to detect small variations in performance, especially when it is most difficult to observe them.
Sometimes it’s actually quite easy, so let’s look at the easy cases first. Over time I have built up a collection of tricks and tools for catching a problem in action. The process of catching and diagnosing a lightning-fast performance problem looks like the following:
Determine the symptoms of the problem.
Determine how to observe the symptoms reliably and quickly.
Determine how to gather diagnostic data for later.
Set up tools or processes to do the above.
Sift the collected data and diagnose.
This is 95% about figuring out how to observe the problem and gathering the data, and 5% about actually diagnosing. If you don’t get the 95% right, you’ll gather too little or too much data, or you’ll capture it at the wrong time. Your job is hard enough; you won’t be successful if you simply gather gigs of data for weeks at a time. You need to be as precise as you can. Here are some examples:
There are normally very few connections to the server, but sometimes I start getting “Error: max_connections exceeded” or similar. Solution: observe Threads_connected from SHOW GLOBAL STATUS and react when it grows too large, or when you cannot log in to query Threads_connected.
There are normally very few queries running, but we have a connection pool (and thus Threads_connected is constant). During the freezes, hundreds of queries show up in SHOW PROCESSLIST. Solution: observe Threads_running from SHOW GLOBAL STATUS and react when it grows too large.
You get the point. Find a simple metric and figure out how to capture it — usually this is possible with a little bit of bash, awk, and grep. You might need to look for something specific in SHOW INNODB STATUS, for example, such as a large number of transactions in LOCK WAIT status.
But sometimes it’s much harder. What if you simply can’t observe the problem internally to MySQL? This does happen, especially when nothing changes except for response time. This was the case in the customer’s system that I discussed in the “predicting performance problems” blog post linked above. Every single metric provided by MySQL itself stayed constant during the mini-freezes. The problem is that you can’t get information on response time from within MySQL.
I ended up writing tools to help with this, of course. I’ll show the results below.
If fluctuations in response time are the problem, then the way to observe it is to measure response time. This requires some care, because you don’t want false positives, and a lot of my ideas were obviously vulnerable to false positives. I could cross them off right away. I can’t trigger on unusually large or small numbers of queries, for example, because those just happen as the workload fluctuates through the day, and random user behavior naturally introduces variations too.
I ended up writing a tool to tail the slow query log file, which I set to zero so it captured all queries with microsecond precision. Once per second, MySQL writes out the current timestamp to the log file, so when I see that marker, I know that a second’s worth of queries has passed by. I aggregate the last second’s worth of queries (count, total time, average time) and print out a line.
This in itself does not provide a good way to know when something unusual is happening, but it gives the foundation for it. I took it slightly further: I kept a sliding window of the last 60 1-second averages, and took the standard deviation of those. If the current second’s average response time deviates significantly from the average response time over the last 60 seconds, then something is wrong. “Significant” is pretty easy to measure with standard deviations, so that’s where the real magic comes in. Let’s see some samples of this.
First, here’s a bit of the slow query log chopped into 1-second segments and aggregated:
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:05:17 0.477078 2084 0.000229 0.000795 0.063251 0.008954
100519 18:05:18 0.264729 1823 0.000145 0.000756 0.061334 0.009960
100519 18:05:19 0.287641 1936 0.000149 0.000720 0.059481 0.009599
100519 18:05:20 0.213181 1619 0.000132 0.000691 0.058050 0.009641
100519 18:05:21 0.276063 1520 0.000182 0.000669 0.056806 0.008587
100519 18:05:22 0.289921 1963 0.000148 0.000642 0.055310 0.008936
100519 18:05:23 0.277754 1882 0.000148 0.000618 0.053983 0.008717
100519 18:05:24 0.337821 1900 0.000178 0.000598 0.052745 0.007963
100519 18:05:25 0.236592 1727 0.000137 0.000579 0.051682 0.008556
100519 18:05:26 0.257150 1488 0.000173 0.000566 0.050821 0.007727
100519 18:05:27 0.303697 1672 0.000182 0.000552 0.049908 0.007412
100519 18:05:28 0.182106 1416 0.000129 0.000539 0.049163 0.008346
100519 18:05:29 0.211202 1631 0.000129 0.000525 0.048347 0.008186
The columns mean the following:
Time is the timestamp of this second’s stats.
Total is the total response time within the sample, in seconds.
Count is how many queries were in that sample.
Avg is just the mean response time (Total / Count).
1-Min Avg is the one-minute moving average of response time.
1-Min StDev is the standard deviation of the average response times for each of the previous 60 seconds.
Sigma is the difference between this second’s average response time and the 1-Min Avg, in standard deviations.
As you can see, most of the time the deviation between this second’s average and the last minute’s average is quite low. But when there’s a meaningful fluctuation in performance, that changes pretty clearly. Here’s a sample with a blip at 18:09:30:
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:09:26 0.187038 1245 0.000150 0.000159 0.006844 0.001343
100519 18:09:27 0.269272 1697 0.000159 0.000160 0.006862 0.000178
100519 18:09:28 0.329386 1901 0.000173 0.000160 0.006895 0.001865
100519 18:09:29 0.350918 2017 0.000174 0.000161 0.006929 0.001881
100519 18:09:30 0.016610 73 0.000228 0.000161 0.006943 0.009537
100519 18:09:31 0.590175 2905 0.000203 0.000162 0.007074 0.005815
100519 18:09:32 0.384193 1879 0.000204 0.000163 0.007133 0.005783
100519 18:09:33 0.345033 2044 0.000169 0.000163 0.007133 0.000815
100519 18:09:34 0.289663 1793 0.000162 0.000163 0.007148 0.000255
That was a fast one! It flew by too quickly to do much about. But it was also not a very large deviation, and could have been a false positive. In any case, I highly doubt that we would have caught anything meaningful by triggering a stats-collection process just then. Let’s keep looking.
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:10:05 0.209619 1578 0.000133 0.000181 0.009542 0.005044
100519 18:10:06 0.279070 1849 0.000151 0.000181 0.009546 0.003167
100519 18:10:07 1.152811 1624 0.000710 0.000189 0.010152 0.051257
100519 18:10:08 0.342763 1450 0.000236 0.000191 0.010188 0.004478
100519 18:10:09 0.200373 1452 0.000138 0.000190 0.010182 0.005155
100519 18:10:10 0.231888 1577 0.000147 0.000191 0.010190 0.004289
100519 18:10:13 0.000459 10 0.000046 0.000191 0.010241 0.014203
100519 18:10:14 0.001999 4 0.000500 0.000189 0.010139 0.030678
100519 18:10:15 0.165705 582 0.000285 0.000189 0.010151 0.009423
100519 18:10:16 7.129640 5104 0.001397 0.000251 0.024463 0.046854
100519 18:10:17 1.140011 2859 0.000399 0.000256 0.024555 0.005817
100519 18:10:18 0.325617 2240 0.000145 0.000255 0.024491 0.004460
100519 18:10:19 0.243101 1538 0.000158 0.000255 0.024510 0.003966
Another relatively short blip but a bit longer. The mean response time really didn’t deviate as much as my client was complaining about — they were showing me New Relic transaction traces with 50-second waits. Maybe I could have caught something here, but I doubt that it’d be enough to separate the signal from the noise. Still, at this point you can clearly see how sensitive this technique is. The deviation in average response varies from a few thousandths of a sigma to a few hundredths. Let’s keep looking for something more dramatic to use as a trigger:
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:10:57 0.352336 2038 0.000173 0.000282 0.026701 0.004092
100519 18:10:58 0.260373 1692 0.000154 0.000283 0.026725 0.004817
100519 18:10:59 1.453306 1834 0.000792 0.000294 0.027073 0.018400
100519 18:11:00 0.264517 1658 0.000160 0.000295 0.027084 0.004989
100519 18:11:01 0.093991 953 0.000099 0.000294 0.027148 0.007203
100519 18:11:02 1.119373 469 0.002387 0.000306 0.027513 0.075629
100519 18:11:03 8.609779 291 0.029587 0.000395 0.038954 0.749383
100519 18:11:04 3.474422 103 0.033732 0.000435 0.040854 0.815026
100519 18:11:05 4.095386 111 0.036895 0.000483 0.043286 0.841211
100519 18:11:06 14.951602 131 0.114134 0.000647 0.065451 1.733932
100519 18:11:07 5.954177 52 0.114503 0.000720 0.068831 1.653074
100519 18:11:08 19.979373 53 0.376969 0.000952 0.096110 3.912385
100519 18:11:09 8.056343 16 0.503521 0.001047 0.100590 4.995285
100519 18:11:10 28.035963 7 4.005138 0.001380 0.138789 28.847777
100519 18:11:11 0.017634 69 0.000256 0.001400 0.139897 0.008182
100519 18:11:12 10.516826 20 0.525841 0.001548 0.145732 3.597660
100519 18:11:13 11.889159 50 0.237783 0.001687 0.151319 1.560253
100519 18:11:14 0.032239 138 0.000234 0.001685 0.151199 0.009599
100519 18:11:15 39.607576 38 1.042305 0.002164 0.204041 5.097713
100519 18:11:16 14.577523 40 0.364438 0.002397 0.215261 1.681870
100519 18:11:17 47.602524 34 1.400074 0.003094 0.278776 5.011118
100519 18:11:18 0.016022 84 0.000191 0.003180 0.282795 0.010570
We totally hit pay dirt here. This period in the log corresponded exactly to one of the visible spikes in New Relic. There were extremely long queries in the log, and throughput dropped to the floor — for an extended time. In the far right-hand column, Sigma is in the double digits. More experience showed me that on this particular client’s workload, anything above 0.3 Sigma is a reliable indicator of a real performance problem. If that condition becomes true, then it’s time to gather diagnostic data for a while. This is resistant to false positives from things like the occasional one-off long-running query.
After building this tool — maybe 30 minutes of work or so — I can see that I could have used other metrics instead. The number of queries per second (throughput) varies, just as response time does. And I probably could go back to the database and start watching Handler_ counters, or similar things like Innodb_rows_read, with the same technique. I wasn’t able to see those things as possibilities because of the overwhelming amount of information to sift through before (and I still don’t really know that they’re going to show spikes and notches the same way, I’m just speculating; they might be really noisy and unreliable). However, focusing on response time is an accurate metric, because response time is what actually matters. Handler counters and rows-read counters are secondary effects that can lie, and there is never anything wrong with focusing on primary sources. Looking at secondary things is far too likely to present you with unreliable information, and you end up on wild goose chases that consume huge amounts of your client’s time.
The tool I wrote for this task is crude, and not formally tested, but it’s a great proof of concept. I think the next step is probably going to be something like revamping mk-loadavg (and probably renaming it!) to be able to capture load metrics and variations in a more flexible and meaningful way.
The end result on this case is at least two problems, by the way (we’re still working on it). One was DNS flakiness. The server was not configured with skip_name_resolve, and when DNS stopped working for a short period, everything stopped working. After clearing that up, many but not all of the spikes in response time went away, and permitted me to see that InnoDB is also having trouble. It is actually quite common for multiple things to be going badly on a server, which makes a disciplined approach all the more important. Trial and error is a disaster in cases like these. Peter and I wrote a brief whitepaper about our approach, by the way. You might find it helpful if you are also facing complex performance problems.
Related posts:Response-time optimization in systems that are queuedBrowser variations in RegExp.exec()A metric for MySQL load averageHow Linux iostat computes its resultsVersion 3.0 of mysqlreport released";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Thu, 27 May 2010 01:34:24 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:7:"Aspersa";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:3:"SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"Tools";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:9:"New Relic";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:16883:"I wrote recently about early detection for MySQL performance problems. If your server is having micro-fluctuations in performance, it’s important to know, because very soon they will turn much worse. What can you do about this?
The most important thing is not to guess at what’s happening, but to measure instead. I have seen these problems from DNS, the binary log, failing hardware, the query cache, the table cache, the thread cache, and a variety of InnoDB edge cases. Guessing at the problem is very dangerous; you need diagnostic data. But it is often quite hard to catch a problem in action when you can only observe it in hindsight, and it happens only for a few seconds once or twice a week. This blog post is about how to detect small variations in performance, especially when it is most difficult to observe them.
Sometimes it’s actually quite easy, so let’s look at the easy cases first. Over time I have built up a collection of tricks and tools for catching a problem in action. The process of catching and diagnosing a lightning-fast performance problem looks like the following:
- Determine the symptoms of the problem.
- Determine how to observe the symptoms reliably and quickly.
- Determine how to gather diagnostic data for later.
- Set up tools or processes to do the above.
- Sift the collected data and diagnose.
This is 95% about figuring out how to observe the problem and gathering the data, and 5% about actually diagnosing. If you don’t get the 95% right, you’ll gather too little or too much data, or you’ll capture it at the wrong time. Your job is hard enough; you won’t be successful if you simply gather gigs of data for weeks at a time. You need to be as precise as you can. Here are some examples:
- There are normally very few connections to the server, but sometimes I start getting “Error: max_connections exceeded” or similar. Solution: observe Threads_connected from SHOW GLOBAL STATUS and react when it grows too large, or when you cannot log in to query Threads_connected.
- There are normally very few queries running, but we have a connection pool (and thus Threads_connected is constant). During the freezes, hundreds of queries show up in SHOW PROCESSLIST. Solution: observe Threads_running from SHOW GLOBAL STATUS and react when it grows too large.
You get the point. Find a simple metric and figure out how to capture it — usually this is possible with a little bit of bash, awk, and grep. You might need to look for something specific in SHOW INNODB STATUS, for example, such as a large number of transactions in LOCK WAIT status.
But sometimes it’s much harder. What if you simply can’t observe the problem internally to MySQL? This does happen, especially when nothing changes except for response time. This was the case in the customer’s system that I discussed in the “predicting performance problems” blog post linked above. Every single metric provided by MySQL itself stayed constant during the mini-freezes. The problem is that you can’t get information on response time from within MySQL.
I ended up writing tools to help with this, of course. I’ll show the results below.
If fluctuations in response time are the problem, then the way to observe it is to measure response time. This requires some care, because you don’t want false positives, and a lot of my ideas were obviously vulnerable to false positives. I could cross them off right away. I can’t trigger on unusually large or small numbers of queries, for example, because those just happen as the workload fluctuates through the day, and random user behavior naturally introduces variations too.
I ended up writing a tool to tail the slow query log file, which I set to zero so it captured all queries with microsecond precision. Once per second, MySQL writes out the current timestamp to the log file, so when I see that marker, I know that a second’s worth of queries has passed by. I aggregate the last second’s worth of queries (count, total time, average time) and print out a line.
This in itself does not provide a good way to know when something unusual is happening, but it gives the foundation for it. I took it slightly further: I kept a sliding window of the last 60 1-second averages, and took the standard deviation of those. If the current second’s average response time deviates significantly from the average response time over the last 60 seconds, then something is wrong. “Significant” is pretty easy to measure with standard deviations, so that’s where the real magic comes in. Let’s see some samples of this.
First, here’s a bit of the slow query log chopped into 1-second segments and aggregated:
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:05:17 0.477078 2084 0.000229 0.000795 0.063251 0.008954
100519 18:05:18 0.264729 1823 0.000145 0.000756 0.061334 0.009960
100519 18:05:19 0.287641 1936 0.000149 0.000720 0.059481 0.009599
100519 18:05:20 0.213181 1619 0.000132 0.000691 0.058050 0.009641
100519 18:05:21 0.276063 1520 0.000182 0.000669 0.056806 0.008587
100519 18:05:22 0.289921 1963 0.000148 0.000642 0.055310 0.008936
100519 18:05:23 0.277754 1882 0.000148 0.000618 0.053983 0.008717
100519 18:05:24 0.337821 1900 0.000178 0.000598 0.052745 0.007963
100519 18:05:25 0.236592 1727 0.000137 0.000579 0.051682 0.008556
100519 18:05:26 0.257150 1488 0.000173 0.000566 0.050821 0.007727
100519 18:05:27 0.303697 1672 0.000182 0.000552 0.049908 0.007412
100519 18:05:28 0.182106 1416 0.000129 0.000539 0.049163 0.008346
100519 18:05:29 0.211202 1631 0.000129 0.000525 0.048347 0.008186
The columns mean the following:
- Time is the timestamp of this second’s stats.
- Total is the total response time within the sample, in seconds.
- Count is how many queries were in that sample.
- Avg is just the mean response time (Total / Count).
- 1-Min Avg is the one-minute moving average of response time.
- 1-Min StDev is the standard deviation of the average response times for each of the previous 60 seconds.
- Sigma is the difference between this second’s average response time and the 1-Min Avg, in standard deviations.
As you can see, most of the time the deviation between this second’s average and the last minute’s average is quite low. But when there’s a meaningful fluctuation in performance, that changes pretty clearly. Here’s a sample with a blip at 18:09:30:
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:09:26 0.187038 1245 0.000150 0.000159 0.006844 0.001343
100519 18:09:27 0.269272 1697 0.000159 0.000160 0.006862 0.000178
100519 18:09:28 0.329386 1901 0.000173 0.000160 0.006895 0.001865
100519 18:09:29 0.350918 2017 0.000174 0.000161 0.006929 0.001881
100519 18:09:30 0.016610 73 0.000228 0.000161 0.006943 0.009537
100519 18:09:31 0.590175 2905 0.000203 0.000162 0.007074 0.005815
100519 18:09:32 0.384193 1879 0.000204 0.000163 0.007133 0.005783
100519 18:09:33 0.345033 2044 0.000169 0.000163 0.007133 0.000815
100519 18:09:34 0.289663 1793 0.000162 0.000163 0.007148 0.000255
That was a fast one! It flew by too quickly to do much about. But it was also not a very large deviation, and could have been a false positive. In any case, I highly doubt that we would have caught anything meaningful by triggering a stats-collection process just then. Let’s keep looking.
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:10:05 0.209619 1578 0.000133 0.000181 0.009542 0.005044
100519 18:10:06 0.279070 1849 0.000151 0.000181 0.009546 0.003167
100519 18:10:07 1.152811 1624 0.000710 0.000189 0.010152 0.051257
100519 18:10:08 0.342763 1450 0.000236 0.000191 0.010188 0.004478
100519 18:10:09 0.200373 1452 0.000138 0.000190 0.010182 0.005155
100519 18:10:10 0.231888 1577 0.000147 0.000191 0.010190 0.004289
100519 18:10:13 0.000459 10 0.000046 0.000191 0.010241 0.014203
100519 18:10:14 0.001999 4 0.000500 0.000189 0.010139 0.030678
100519 18:10:15 0.165705 582 0.000285 0.000189 0.010151 0.009423
100519 18:10:16 7.129640 5104 0.001397 0.000251 0.024463 0.046854
100519 18:10:17 1.140011 2859 0.000399 0.000256 0.024555 0.005817
100519 18:10:18 0.325617 2240 0.000145 0.000255 0.024491 0.004460
100519 18:10:19 0.243101 1538 0.000158 0.000255 0.024510 0.003966
Another relatively short blip but a bit longer. The mean response time really didn’t deviate as much as my client was complaining about — they were showing me New Relic transaction traces with 50-second waits. Maybe I could have caught something here, but I doubt that it’d be enough to separate the signal from the noise. Still, at this point you can clearly see how sensitive this technique is. The deviation in average response varies from a few thousandths of a sigma to a few hundredths. Let’s keep looking for something more dramatic to use as a trigger:
Time Total Count Avg 1-Min Avg 1-Min StDev Sigma
100519 18:10:57 0.352336 2038 0.000173 0.000282 0.026701 0.004092
100519 18:10:58 0.260373 1692 0.000154 0.000283 0.026725 0.004817
100519 18:10:59 1.453306 1834 0.000792 0.000294 0.027073 0.018400
100519 18:11:00 0.264517 1658 0.000160 0.000295 0.027084 0.004989
100519 18:11:01 0.093991 953 0.000099 0.000294 0.027148 0.007203
100519 18:11:02 1.119373 469 0.002387 0.000306 0.027513 0.075629
100519 18:11:03 8.609779 291 0.029587 0.000395 0.038954 0.749383
100519 18:11:04 3.474422 103 0.033732 0.000435 0.040854 0.815026
100519 18:11:05 4.095386 111 0.036895 0.000483 0.043286 0.841211
100519 18:11:06 14.951602 131 0.114134 0.000647 0.065451 1.733932
100519 18:11:07 5.954177 52 0.114503 0.000720 0.068831 1.653074
100519 18:11:08 19.979373 53 0.376969 0.000952 0.096110 3.912385
100519 18:11:09 8.056343 16 0.503521 0.001047 0.100590 4.995285
100519 18:11:10 28.035963 7 4.005138 0.001380 0.138789 28.847777
100519 18:11:11 0.017634 69 0.000256 0.001400 0.139897 0.008182
100519 18:11:12 10.516826 20 0.525841 0.001548 0.145732 3.597660
100519 18:11:13 11.889159 50 0.237783 0.001687 0.151319 1.560253
100519 18:11:14 0.032239 138 0.000234 0.001685 0.151199 0.009599
100519 18:11:15 39.607576 38 1.042305 0.002164 0.204041 5.097713
100519 18:11:16 14.577523 40 0.364438 0.002397 0.215261 1.681870
100519 18:11:17 47.602524 34 1.400074 0.003094 0.278776 5.011118
100519 18:11:18 0.016022 84 0.000191 0.003180 0.282795 0.010570
We totally hit pay dirt here. This period in the log corresponded exactly to one of the visible spikes in New Relic. There were extremely long queries in the log, and throughput dropped to the floor — for an extended time. In the far right-hand column, Sigma is in the double digits. More experience showed me that on this particular client’s workload, anything above 0.3 Sigma is a reliable indicator of a real performance problem. If that condition becomes true, then it’s time to gather diagnostic data for a while. This is resistant to false positives from things like the occasional one-off long-running query.
After building this tool — maybe 30 minutes of work or so — I can see that I could have used other metrics instead. The number of queries per second (throughput) varies, just as response time does. And I probably could go back to the database and start watching Handler_ counters, or similar things like Innodb_rows_read, with the same technique. I wasn’t able to see those things as possibilities because of the overwhelming amount of information to sift through before (and I still don’t really know that they’re going to show spikes and notches the same way, I’m just speculating; they might be really noisy and unreliable). However, focusing on response time is an accurate metric, because response time is what actually matters. Handler counters and rows-read counters are secondary effects that can lie, and there is never anything wrong with focusing on primary sources. Looking at secondary things is far too likely to present you with unreliable information, and you end up on wild goose chases that consume huge amounts of your client’s time.
The tool I wrote for this task is crude, and not formally tested, but it’s a great proof of concept. I think the next step is probably going to be something like revamping mk-loadavg (and probably renaming it!) to be able to capture load metrics and variations in a more flexible and meaningful way.
The end result on this case is at least two problems, by the way (we’re still working on it). One was DNS flakiness. The server was not configured with skip_name_resolve, and when DNS stopped working for a short period, everything stopped working. After clearing that up, many but not all of the spikes in response time went away, and permitted me to see that InnoDB is also having trouble. It is actually quite common for multiple things to be going badly on a server, which makes a disciplined approach all the more important. Trial and error is a disaster in cases like these. Peter and I wrote a brief whitepaper about our approach, by the way. You might find it helpful if you are also facing complex performance problems.
Related posts:
- Response-time optimization in systems that are queued
- Browser variations in RegExp.exec()
- A metric for MySQL load average
- How Linux iostat computes its results
- Version 3.0 of mysqlreport released
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:22:"Baron Schwartz (xaprb)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:32;a:6:{s:4:"data";s:78:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:42:"Detecting invalid and zero temporal values";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:32:"http://hackmysql.com/blog/?p=119";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:80:"http://hackmysql.com/blog/2010/05/26/detecting-invalid-and-zero-temporal-values/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:4783:"I’ve been thinking a lot about invalid and zero temporal values and how to detect them with MySQL date and time functions because mk-table-checksum has to handle “everything” correctly and efficiently. The requirements are complex because we have to take into account what MySQL allows to be stored verses what it allows to be used in certain operations and functions, how it sorts a mix of real and invalid temporal values for MIN() and MAX(), how to detect a temporal value as equivalent to zero, and how different MySQL versions might affect any of the aforementioned.
At base, the four guiding requirements are:
Detect and discard invalid time, date, and datetime values
Detect zero-equivalent temporal values
Do #1 and #2 using only MySQL functions
Work in MySQL 4.0 and newer
My tests cases for invalid temporal values are:
00:00:60
00:60:00
999-00-00
999-01-01
0000-00-00
2009-00-00
2009-13-00
999-00-00 00:00:00
999-01-01 00:00:00
0000-00-00 00:00:00
1000-00-00 00:00:00
2009-00-00 00:00:00
2009-13-00 00:00:00
2009-05-26 00:00:60
2009-05-26 00:60:00
2009-05-26 24:00:00
And my test cases for first real temporal values are:
00:00:00
00:00:01
1000-01-01
2009-01-01
1000-01-01 00:00:00
2009-01-01 00:00:00
And there is only one real zero-equivalent temporal value: 00:00:00.
So the first requirement is to find a MySQL function that returns NULL for all those invalid values, and that function is TO_DAYS with one exception:
mysql> SELECT TO_DAYS('999-01-01 00:00:00');
+-------------------------------+
| TO_DAYS('999-01-01 00:00:00') |
+-------------------------------+
| 364878 |
+-------------------------------+
That date is only valid if years before 1000 are handled but the MySQL manual says that,
TO_DAYS() is not intended for use with values that precede the advent of the Gregorian calendar (1582)
so we’re already way past the limit of its intended use and, moreover, the supported lower limit of a date or datetime is 1000-01-01, so says the manual. It’s reasonable to not bother with pre-year 1000 dates so I’ll overlook this.
Excepting pre-year 1000 dates, TO_DAYS() returns NULL for all the invalid values. By contrast, UNIX_TIMESTAMP() returns zero for all the invalid values and TIME_TO_SEC() returns a mix of NULL, zero, and values. So the apparent winner for requirement #1 is TO_DAYS(), but…
Requirement #2 complicates the issue because the time 00:00:00 is valid and zero-equivalent but TO_DAYS() returns NULL for it. We need a hack that handles all the cases, and here it is:
SELECT IF(TIME_FORMAT(?,'%H:%i:%s')=?, TIME_TO_SEC(?), TO_DAYS(?))
That says, basically: if the value is a time then evaluate it with TIME_TO_SEC(), else evaluate it with TO_DAYS(). It works so well in fact that it satisfies all four requirements. 00:00:00 evaluates to zero, all the invalid values evaluate to NULL, and all the valid values evaluate to various non-null values. I have to use TIME_FORMAT() instead of just TIME() because TIME() wasn’t introduced until MySQL v4.1 (fourth requirement).
The hack works because of this (substituting TIME() for TIME_FORMAT()):
mysql> SELECT TIME('00:00:00');
+------------------+
| TIME('00:00:00') |
+------------------+
| 00:00:00 |
+------------------+
mysql> SELECT TIME('00-00-00');
+------------------+
| TIME('00-00-00') |
+------------------+
| 00:00:00 |
+------------------+
mysql> SELECT TIME('2010-05-26');
+--------------------+
| TIME('2010-05-26') |
+--------------------+
| 00:20:10 |
+--------------------+
mysql> SELECT TIME('2010-05-26 10:10:10');
+-----------------------------+
| TIME('2010-05-26 10:10:10') |
+-----------------------------+
| 10:10:10 |
+-----------------------------+
As you can see, TIME() (or TIME_FORMAT()) returns the exact same value if the given value is a time, otherwise it interprets the value–which is a date or datetime–as a time causing it to return a different value than the given value. Thus we discern time values from date and datetime values and evaluate them separately with TIME_TO_SEC().
I tested on MySQL v4.0, 4.1, 5.0 and 5.1 and all pass. The only difference is 4.0 verses the others for the pre-year 1000 dates, but I’m ignoring these anyway.
Of course all the preceding could have been accomplished in code by looking at the column type and choosing the correct MySQL function to evaluate the value and check if it’s zero-equivalent, but I was curious to see if it could be done using only MySQL since, after all, it is MySQL that permits these silly, invalid temporals values.
If you know a simpler, more elegant solution that meets the four requirements and passes all the tests, please share!";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 23:48:31 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:8:{i:0;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:8:"datetime";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:4:"time";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:9:"timestamp";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:11:"TIME_FORMAT";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:11:"TIME_TO_SEC";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:7:"TO_DAYS";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:14:"UNIX_TIMESTAMP";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:6192:"I’ve been thinking a lot about invalid and zero temporal values and how to detect them with MySQL date and time functions because mk-table-checksum has to handle “everything” correctly and efficiently. The requirements are complex because we have to take into account what MySQL allows to be stored verses what it allows to be used in certain operations and functions, how it sorts a mix of real and invalid temporal values for MIN() and MAX(), how to detect a temporal value as equivalent to zero, and how different MySQL versions might affect any of the aforementioned.
At base, the four guiding requirements are:
- Detect and discard invalid time, date, and datetime values
- Detect zero-equivalent temporal values
- Do #1 and #2 using only MySQL functions
- Work in MySQL 4.0 and newer
My tests cases for invalid temporal values are:
00:00:60
00:60:00
999-00-00
999-01-01
0000-00-00
2009-00-00
2009-13-00
999-00-00 00:00:00
999-01-01 00:00:00
0000-00-00 00:00:00
1000-00-00 00:00:00
2009-00-00 00:00:00
2009-13-00 00:00:00
2009-05-26 00:00:60
2009-05-26 00:60:00
2009-05-26 24:00:00
And my test cases for first real temporal values are:
00:00:00
00:00:01
1000-01-01
2009-01-01
1000-01-01 00:00:00
2009-01-01 00:00:00
And there is only one real zero-equivalent temporal value: 00:00:00.
So the first requirement is to find a MySQL function that returns NULL for all those invalid values, and that function is TO_DAYS with one exception:
mysql> SELECT TO_DAYS('999-01-01 00:00:00');
+-------------------------------+
| TO_DAYS('999-01-01 00:00:00') |
+-------------------------------+
| 364878 |
+-------------------------------+
That date is only valid if years before 1000 are handled but the MySQL manual says that,
TO_DAYS() is not intended for use with values that precede the advent of the Gregorian calendar (1582)
so we’re already way past the limit of its intended use and, moreover, the supported lower limit of a date or datetime is 1000-01-01, so says the manual. It’s reasonable to not bother with pre-year 1000 dates so I’ll overlook this.
Excepting pre-year 1000 dates, TO_DAYS() returns NULL for all the invalid values. By contrast, UNIX_TIMESTAMP() returns zero for all the invalid values and TIME_TO_SEC() returns a mix of NULL, zero, and values. So the apparent winner for requirement #1 is TO_DAYS(), but…
Requirement #2 complicates the issue because the time 00:00:00 is valid and zero-equivalent but TO_DAYS() returns NULL for it. We need a hack that handles all the cases, and here it is:
SELECT IF(TIME_FORMAT(?,'%H:%i:%s')=?, TIME_TO_SEC(?), TO_DAYS(?))
That says, basically: if the value is a time then evaluate it with TIME_TO_SEC(), else evaluate it with TO_DAYS(). It works so well in fact that it satisfies all four requirements. 00:00:00 evaluates to zero, all the invalid values evaluate to NULL, and all the valid values evaluate to various non-null values. I have to use TIME_FORMAT() instead of just TIME() because TIME() wasn’t introduced until MySQL v4.1 (fourth requirement).
The hack works because of this (substituting TIME() for TIME_FORMAT()):
mysql> SELECT TIME('00:00:00');
+------------------+
| TIME('00:00:00') |
+------------------+
| 00:00:00 |
+------------------+
mysql> SELECT TIME('00-00-00');
+------------------+
| TIME('00-00-00') |
+------------------+
| 00:00:00 |
+------------------+
mysql> SELECT TIME('2010-05-26');
+--------------------+
| TIME('2010-05-26') |
+--------------------+
| 00:20:10 |
+--------------------+
mysql> SELECT TIME('2010-05-26 10:10:10');
+-----------------------------+
| TIME('2010-05-26 10:10:10') |
+-----------------------------+
| 10:10:10 |
+-----------------------------+
As you can see, TIME() (or TIME_FORMAT()) returns the exact same value if the given value is a time, otherwise it interprets the value–which is a date or datetime–as a time causing it to return a different value than the given value. Thus we discern time values from date and datetime values and evaluate them separately with TIME_TO_SEC().
I tested on MySQL v4.0, 4.1, 5.0 and 5.1 and all pass. The only difference is 4.0 verses the others for the pre-year 1000 dates, but I’m ignoring these anyway.
Of course all the preceding could have been accomplished in code by looking at the column type and choosing the correct MySQL function to evaluate the value and check if it’s zero-equivalent, but I was curious to see if it could be done using only MySQL since, after all, it is MySQL that permits these silly, invalid temporals values.
If you know a simpler, more elegant solution that meets the four requirements and passes all the tests, please share!
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Daniel Nichter";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:33;a:6:{s:4:"data";s:68:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:30:"A small issue of SQL standards";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:59:"tag:blogger.com,1999:blog-15319370.post-7122558264129037578";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:70:"http://rpbouman.blogspot.com/2010/05/small-issue-of-sql-standards.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:5271:"From a functional perspective, the core SQL support in all major and minor RDBMS-es is reasonably similar. In this light, it's sometimes quite disturbing to find how some very basic things work so differently across different products. Consider this simple statement:SELECT 'a' /* this is a comment */ 'b'FROM onerowWhat should the result be? (You can assume that onerow is an existing table that contains one row)It turns out popular RDBMS-es mostly disagree with one another.In Oracle XE, we get this:SELECT 'a' /* comment */ 'b' *ERROR at line 1:ORA-00923: FROM keyword not found where expectedPostgreSQL 8.4 also treats it as a syntax error, and thus seems compatible with Oracle's behavior: ERROR: syntax error at or near "'b'"LINE 1: SELECT 'a' /* this is a comment */ 'b'In Microsoft SQL Server 2008 we get: b-a(1 rows affected)As you can see, MS SQL treats the query as SELECT 'a' AS b FROM onerow.Finally, in MySQL 5.1, we get:+----+| a |+----+| ab |+----+1 row in set (0.00 sec)So in MySQL, its as if the comment isn't there at all, and as if the string literals 'a' and 'b' are actually just one string literal 'ab'.So what does the SQL standard say? In my copy of the 2003 edition, I find this (ISO/IEC 9075-2:2003 (E) 5.3 <literal>, page 145):Syntax Rules1) In a <character string literal> or <national character string literal>, the sequence:<quote> <character representation>... <quote> <separator> <quote> <character representation>... <quote>is equivalent to the sequence<quote> <character representation>... <character representation>... <quote>If we lookup the definition of <separator>, it reads: <separator> ::= { <comment> | <white space> }...So in this case, MySQL does the "right" thing, and basically ignores the comment, treating 'a' and 'b' as a single string constant 'ab'.UPDATE 1: As always, the devil is in the details. And trust me, the SQL standard has many of them (details that is - I'll leave it up to the reader to decide for the devils, although I have a suspicion in a particular direction). Read on, and make sure to read Nick's comment on this post - it turns out PostgreSQL seems to behave exactly according to the standard in this case.UPDATE 2: Serg also posted a comment citing yet another part of the standard that states that all comments implicitly count as a newline. This would mean that there doesn't have to be a literal newline character in or following the comment. In this case, my original remark that MySQL got it right would hold again.I should state that I think very highly of both Nick and Serg, and as far as I am concerned, they're both right. I can't help but seeing this as yet more support for my statement that the SQL standard is so complex it is almost or perhaps completely impossible to get it right. Do you find this too bold? If so, I'd really love to hear your thoughts on it. Please help us solve this paradox, I only want to understand what the standard really says.If you try the same thing with a single line comment, all products mentioned react the same as with the initial query, except for PostgreSQL, which now treats the query according to the standard.Now don't get me wrong. This post is not designed to bash or glorify any of the products mentioned. I think all of them are great in their own way. I am quite aware that although MySQL happens to adhere to the standard here, it violates it in other places. Finally, I should point out that I don't have a specific opinion on what the right behavior should be. I just want it to be the same on all platforms.At the same time, I realize that for SQL it's probably too late - up to an extent, incompatibility is considered normal, and database professionals tend to be specialized in particular products anyway. So I'm not holding my breath for the grand unification of SQL dialects.When I encountered this issue, I did have to think about that other rathole of incompatibilities I have to deal with professionally, which is web-browsers. An interesting development there is the HTML 5 specification, which actually defines an algorithm for parsing HTML - even invalid HTML. This is quite different from the approach taken by most standards, which typically define only an abstract grammar, but leave the implementation entirely up to the vendors. In theory, providing parsing instructions as detailed as done in HTML 5 should make it easier to create correct parsers, and hopefully this will contribute to a more robust web.Anyway. That was all. Back to work...UPDATE: I just heard that Sybase (unsurprisingly) behaves similar to MS SQL for this query (that is, query is valid, and returns 'a' in a column called b). I checked SQLite myself, which is also in that camp.Nick also pointed out that LucidDB also provides a standard compliant implementation, in other words, it behaves exactly like PostgreSQL for this particular query. However, Julian, who was and is closely involved in LucidDB agrees with Serg that the comment should probably count as a newline, and filed a bug for LucidDB.I checked Firebird 2.1.3, and they are in the Oracle camp: in both cases, the query gives a syntax error.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 19:45:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:6:{i:0;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:10:"PostgreSQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"Oracle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:3:"SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:13:"Ms SQL Server";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:8:"ISO 9075";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:6314:"From a functional perspective, the core SQL support in all major and minor RDBMS-es is reasonably similar. In this light, it's sometimes quite disturbing to find how some very basic things work so differently across different products. Consider this simple statement:SELECT 'a' /* this is a comment */ 'b'
FROM onerow
What should the result be? (You can assume that onerow is an existing table that contains one row)
It turns out popular RDBMS-es mostly disagree with one another.
In Oracle XE, we get this:SELECT 'a' /* comment */ 'b'
*
ERROR at line 1:
ORA-00923: FROM keyword not found where expected
PostgreSQL 8.4 also treats it as a syntax error, and thus seems compatible with Oracle's behavior: ERROR: syntax error at or near "'b'"
LINE 1: SELECT 'a' /* this is a comment */ 'b'
In Microsoft SQL Server 2008 we get: b
-
a
(1 rows affected)
As you can see, MS SQL treats the query as SELECT 'a' AS b FROM onerow.
Finally, in MySQL 5.1, we get:+----+
| a |
+----+
| ab |
+----+
1 row in set (0.00 sec)
So in MySQL, its as if the comment isn't there at all, and as if the string literals 'a' and 'b' are actually just one string literal 'ab'.
So what does the SQL standard say? In my copy of the 2003 edition, I find this (ISO/IEC 9075-2:2003 (E) 5.3 <literal>, page 145):Syntax Rules
1) In a <character string literal> or <national character string literal>, the sequence:<quote> <character representation>... <quote> <separator> <quote> <character representation>... <quote>
is equivalent to the sequence<quote> <character representation>... <character representation>... <quote>
If we lookup the definition of <separator>, it reads: <separator> ::= { <comment> | <white space> }...So in this case, MySQL does the "right" thing, and basically ignores the comment, treating 'a' and 'b' as a single string constant 'ab'.
UPDATE 1: As always, the devil is in the details. And trust me, the SQL standard has many of them (details that is - I'll leave it up to the reader to decide for the devils, although I have a suspicion in a particular direction). Read on, and make sure to read Nick's comment on this post - it turns out PostgreSQL seems to behave exactly according to the standard in this case.
UPDATE 2: Serg also posted a comment citing yet another part of the standard that states that all comments implicitly count as a newline. This would mean that there doesn't have to be a literal newline character in or following the comment. In this case, my original remark that MySQL got it right would hold again.
I should state that I think very highly of both Nick and Serg, and as far as I am concerned, they're both right. I can't help but seeing this as yet more support for my statement that the SQL standard is so complex it is almost or perhaps completely impossible to get it right.
Do you find this too bold? If so, I'd really love to hear your thoughts on it. Please help us solve this paradox, I only want to understand what the standard really says.
If you try the same thing with a single line comment, all products mentioned react the same as with the initial query, except for PostgreSQL, which now treats the query according to the standard.
Now don't get me wrong. This post is not designed to bash or glorify any of the products mentioned. I think all of them are great in their own way. I am quite aware that although MySQL happens to adhere to the standard here, it violates it in other places. Finally, I should point out that I don't have a specific opinion on what the right behavior should be. I just want it to be the same on all platforms.
At the same time, I realize that for SQL it's probably too late - up to an extent, incompatibility is considered normal, and database professionals tend to be specialized in particular products anyway. So I'm not holding my breath for the grand unification of SQL dialects.
When I encountered this issue, I did have to think about that other rathole of incompatibilities I have to deal with professionally, which is web-browsers. An interesting development there is the HTML 5 specification, which actually defines an algorithm for parsing HTML - even invalid HTML. This is quite different from the approach taken by most standards, which typically define only an abstract grammar, but leave the implementation entirely up to the vendors. In theory, providing parsing instructions as detailed as done in HTML 5 should make it easier to create correct parsers, and hopefully this will contribute to a more robust web.
Anyway. That was all. Back to work...
UPDATE: I just heard that Sybase (unsurprisingly) behaves similar to MS SQL for this query (that is, query is valid, and returns 'a' in a column called b). I checked SQLite myself, which is also in that camp.
Nick also pointed out that LucidDB also provides a standard compliant implementation, in other words, it behaves exactly like PostgreSQL for this particular query. However, Julian, who was and is closely involved in LucidDB agrees with Serg that the comment should probably count as a newline, and filed a bug for LucidDB.
I checked Firebird 2.1.3, and they are in the Oracle camp: in both cases, the query gives a syntax error.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"Roland Bouman";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:34;a:6:{s:4:"data";s:43:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:28:"MySQL, what are you smoking?";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-4622775563416752930.post-6629528762598690058";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:73:"http://marksverbiage.blogspot.com/2010/05/mysql-what-are-you-smoking.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2584:"There are a lot of weird things which MySQL does to handle its mix of transactional and non-transactional behaviour, but this one was new to me :)create table t1 (ID INT NOT NULL PRIMARY KEY, V INT NOT NULL);Query OK, 0 rows affected (0.01 sec)mysql> insert into t1 (ID,V) VALUES (2,NULL);ERROR 1048 (23000): Column 'V' cannot be nullmysql> insert into t1 (ID,V) VALUES (3,1),(4,1);Query OK, 2 rows affected (0.00 sec)Records: 2 Duplicates: 0 Warnings: 0insert into t1 (ID,V) VALUES (5,1),(6,NULL);Query OK, 2 rows affected, 1 warning (0.00 sec)Records: 2 Duplicates: 0 Warnings: 1mysql> show warnings;+---------+------+---------------------------+| Level | Code | Message |+---------+------+---------------------------+| Warning | 1048 | Column 'V' cannot be null |+---------+------+---------------------------+1 row in set (0.00 sec)select * from t1;+----+---+| ID | V |+----+---+| 3 | 1 || 4 | 1 || 5 | 1 || 6 | 0 |+----+---+4 rows in set (0.00 sec)What's going on?MySQL does not consider (some) errors to be errors, if they happen on the second or subsequent row of a multi-row insert.On the other hand, if it happens on the first row, it's an error.Why?Because non-transactional engines can't rollback to a savepoint. This means that if it's inserted one or more rows already, to generate an error would leave some stuff in the database.No, really why?I don't know. This is not consistent with, for example, a unique index violation, which makes it stop half way through a multi-row insert anyway, and non-transactional engines can't rollback.So if you insert a duplicate, THAT still generates an error on the second a subsequent row. It's not even consistent!mysql> insert into t1 (ID,V) VALUES (10,1),(10,2);ERROR 1062 (23000): Duplicate entry '10' for key 'PRIMARY'mysql> select * from t1;+----+---+| ID | V |+----+---+| 3 | 1 || 4 | 1 || 5 | 1 || 6 | 0 || 10 | 1 |+----+---+5 rows in set (0.00 sec)Of course if we use a transactional engine, it looks better:mysql> ALTER TABLE t1 ENGINE=InnoDB;Query OK, 5 rows affected (0.00 sec)Records: 5 Duplicates: 0 Warnings: 0mysql> insert into t1 (ID,V) VALUES (20,1),(20,2);ERROR 1062 (23000): Duplicate entry '20' for key 'PRIMARY'mysql> select * from t1 WHERE ID=20;Empty set (0.00 sec)But that won't change the behaviour of a multi-row insert with NULLs in invalid places.This kind of stuff is nonsense and we need it to GO AWAY NOW.How?SET SQL_MODE='STRICT_ALL_TABLES' And now every error is really an error. Yay! Why can't this be the default? (I know the answer; this is a rhetorical question)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 16:59:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:1:{i:0;a:5:{s:4:"data";s:19:"mysql nonsense rant";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:3597:"There are a lot of weird things which MySQL does to handle its mix of transactional and non-transactional behaviour, but this one was new to me :)
create table t1 (ID INT NOT NULL PRIMARY KEY, V INT NOT NULL);
Query OK, 0 rows affected (0.01 sec)
mysql> insert into t1 (ID,V) VALUES (2,NULL);
ERROR 1048 (23000): Column 'V' cannot be null
mysql> insert into t1 (ID,V) VALUES (3,1),(4,1);
Query OK, 2 rows affected (0.00 sec)
Records: 2 Duplicates: 0 Warnings: 0
insert into t1 (ID,V) VALUES (5,1),(6,NULL);
Query OK, 2 rows affected, 1 warning (0.00 sec)
Records: 2 Duplicates: 0 Warnings: 1
mysql> show warnings;
+---------+------+---------------------------+
| Level | Code | Message |
+---------+------+---------------------------+
| Warning | 1048 | Column 'V' cannot be null |
+---------+------+---------------------------+
1 row in set (0.00 sec)
select * from t1;
+----+---+
| ID | V |
+----+---+
| 3 | 1 |
| 4 | 1 |
| 5 | 1 |
| 6 | 0 |
+----+---+
4 rows in set (0.00 sec)
What's going on?
MySQL does not consider (some) errors to be errors, if they happen on the second or subsequent row of a multi-row insert.
On the other hand, if it happens on the first row, it's an error.
Why?
Because non-transactional engines can't rollback to a savepoint. This means that if it's inserted one or more rows already, to generate an error would leave some stuff in the database.
No, really why?
I don't know. This is not consistent with, for example, a unique index violation, which makes it stop half way through a multi-row insert anyway, and non-transactional engines can't rollback.
So if you insert a duplicate, THAT still generates an error on the second a subsequent row. It's not even consistent!
mysql> insert into t1 (ID,V) VALUES (10,1),(10,2);
ERROR 1062 (23000): Duplicate entry '10' for key 'PRIMARY'
mysql> select * from t1;
+----+---+
| ID | V |
+----+---+
| 3 | 1 |
| 4 | 1 |
| 5 | 1 |
| 6 | 0 |
| 10 | 1 |
+----+---+
5 rows in set (0.00 sec)
Of course if we use a transactional engine, it looks better:
mysql> ALTER TABLE t1 ENGINE=InnoDB;
Query OK, 5 rows affected (0.00 sec)
Records: 5 Duplicates: 0 Warnings: 0
mysql> insert into t1 (ID,V) VALUES (20,1),(20,2);
ERROR 1062 (23000): Duplicate entry '20' for key 'PRIMARY'
mysql> select * from t1 WHERE ID=20;
Empty set (0.00 sec)
But that won't change the behaviour of a multi-row insert with NULLs in invalid places.
This kind of stuff is nonsense and we need it to GO AWAY NOW.
How?
SET SQL_MODE='STRICT_ALL_TABLES'
And now every error is really an error. Yay! Why can't this be the default? (I know the answer; this is a rhetorical question)
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:11:"Mark Robson";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:35;a:6:{s:4:"data";s:48:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:32:"Percona Welcomes Justin Swanhart";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:43:"http://www.mysqlperformanceblog.com/?p=2667";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:80:"http://www.mysqlperformanceblog.com/2010/05/26/percona-welcomes-justin-swanhart/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:766:"Percona is pleased to officially welcome Justin Swanhart to our team of consultants.
Before joining Percona, Justin worked as a MySQL DBA at Gazillion, Yahoo, and Kickfire. Justin has become a regular contributor here on the MySQL Performance Blog as well as being an active blogger at http://swanhart.livejournal.com/. He is very active in the community, maintaining FlexViews, a stored procedure managed solution for materialized view creation and maintenance in MySQL 5.1 as well as instrumentation-for-php, a suite of PHP classes for easing the implementation of instrumentation in applications.
Justin, a big welcome – we are fortunate indeed you’re working with us!
Entry posted by Ryan Lowe |
2 comments
Add to: | | | | ";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 14:03:25 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:2:{i:0;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:7:"percona";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2946:"Percona is pleased to officially welcome Justin Swanhart to our team of consultants.
Before joining Percona, Justin worked as a MySQL DBA at Gazillion, Yahoo, and Kickfire. Justin has become a regular contributor here on the MySQL Performance Blog as well as being an active blogger at http://swanhart.livejournal.com/. He is very active in the community, maintaining FlexViews, a stored procedure managed solution for materialized view creation and maintenance in MySQL 5.1 as well as instrumentation-for-php, a suite of PHP classes for easing the implementation of instrumentation in applications.
Justin, a big welcome – we are fortunate indeed you’re working with us!
Entry posted by Ryan Lowe |
2 comments
Add to:
|
|
|
| 
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:9:"Ryan Lowe";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:36;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:45:"BLOBS in the Drizzle/MySQL Storage Engine API";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:40:"http://www.flamingspork.com/blog/?p=1931";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:89:"http://www.flamingspork.com/blog/2010/05/26/blobs-in-the-drizzlemysql-storage-engine-api/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2781:"Another (AFAIK) undocumented part of the Storage Engine API:
We all know what a normal row looks like in Drizzle/MySQL row format (a NULL bitmap and then column data):
Nothing that special. It’s a fixed sized buffer, Field objects reference into it, you read out of it and write the values into your engine. However, when you get to BLOBs, we can’t use a fixed sized buffer as BLOBs may be quite large. So, the format with BLOBS is the bit in the row is a length of the blob (1, 2, 3 or 4 bytes – in Drizzle it’s only 3 or 4 bytes now and soon only 4 bytes once we fix a bug that isn’t interesting to discuss here). The Second part of the in-row part is a pointer to a location in memory where the BLOB is stored. So a row that has a BLOB in it looks something like this:
The size of the pointer is (of course) platform dependent. On 32bit machines it’s 4 bytes and on 64bit machines it’s 8 bytes.
Now, if I were any other source of documentation, I’d stop right here.
But I’m not. I’m a programmer writing a Storage Engine who now has the crucial question of memory management.
When your engine is given the row from the upper layer (such as doInsertRecord()/write_row()) you don’t have to worry, for the duration of the call, the memory will be there (don’t count on it being there after though, so if you’re not going to immediately splat it somewhere, make your own copy).
For reading, you are expected to provide a pointer to a location in memory that is valid until the next call to your Cursor. For example, rnd_next() call reads a BLOB field and your engine provides a pointer. At the subsequent rnd_next() call, it can free that pointer (or at doStopTableScan()/rnd_end()).
HOWEVER, this is true except for index_read_idx_map(), which in the default implementation in the Cursor (handler) base class ends up doing a doStartIndexScan(), index_read(), doEndIndexScan(). This means that if a BLOB was read, the engine could have (quite rightly) freed that memory already. In this case, you must keep the memory around until either a reset() or extra(HA_EXTRA_FLUSH) call.
This exception is tested (by accident) by a whole single query in type_blob.test – a monster of a query that’s about a seven way join with a group by and an order by. It would be quite possible to write a fairly functional engine and completely miss this.
Good luck.
This blog post (but not the whole blog) is published under the Creative Commons Attribution-Share Alike License. Attribution is by linking back to this post and mentioning my name (Stewart Smith).
Share this on Facebook
Tweet This!
Share this on del.icio.us
Digg this!
Post on Google Buzz
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 13:46:32 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:7:"drizzle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:4:"BLOB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:18:"storage engine api";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:4929:"Another (AFAIK) undocumented part of the Storage Engine API:
We all know what a normal row looks like in Drizzle/MySQL row format (a NULL bitmap and then column data):

Nothing that special. It’s a fixed sized buffer, Field objects reference into it, you read out of it and write the values into your engine. However, when you get to BLOBs, we can’t use a fixed sized buffer as BLOBs may be quite large. So, the format with BLOBS is the bit in the row is a length of the blob (1, 2, 3 or 4 bytes – in Drizzle it’s only 3 or 4 bytes now and soon only 4 bytes once we fix a bug that isn’t interesting to discuss here). The Second part of the in-row part is a pointer to a location in memory where the BLOB is stored. So a row that has a BLOB in it looks something like this:

The size of the pointer is (of course) platform dependent. On 32bit machines it’s 4 bytes and on 64bit machines it’s 8 bytes.
Now, if I were any other source of documentation, I’d stop right here.
But I’m not. I’m a programmer writing a Storage Engine who now has the crucial question of memory management.
When your engine is given the row from the upper layer (such as doInsertRecord()/write_row()) you don’t have to worry, for the duration of the call, the memory will be there (don’t count on it being there after though, so if you’re not going to immediately splat it somewhere, make your own copy).
For reading, you are expected to provide a pointer to a location in memory that is valid until the next call to your Cursor. For example, rnd_next() call reads a BLOB field and your engine provides a pointer. At the subsequent rnd_next() call, it can free that pointer (or at doStopTableScan()/rnd_end()).
HOWEVER, this is true except for index_read_idx_map(), which in the default implementation in the Cursor (handler) base class ends up doing a doStartIndexScan(), index_read(), doEndIndexScan(). This means that if a BLOB was read, the engine could have (quite rightly) freed that memory already. In this case, you must keep the memory around until either a reset() or extra(HA_EXTRA_FLUSH) call.
This exception is tested (by accident) by a whole single query in type_blob.test – a monster of a query that’s about a seven way join with a group by and an order by. It would be quite possible to write a fairly functional engine and completely miss this.
Good luck.
This blog post (but not the whole blog) is published under the Creative Commons Attribution-Share Alike License. Attribution is by linking back to this post and mentioning my name (Stewart Smith).
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"Stewart Smith";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:37;a:6:{s:4:"data";s:48:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:47:"Methods for searching errors in SQL application";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:72:"http://blogs.sun.com/svetasmirnova/entry/methods_for_searching_errors_in";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:72:"http://blogs.sun.com/svetasmirnova/entry/methods_for_searching_errors_in";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:512:"Some time ago I wrote in Russian language guide for finding errors in SQL application.To be honest I wrote it having personal aim to have a text which I can easily use refer in case of user questions about how to find particular thing. But this makes less sense having no English version. So now I started to translate it to English and publish. Introduction and first chapter are ready.You can find it at http://sql-error.microbecal.com/en/index.html Comments and corrections of mistakes are welcome here.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 10:21:04 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:2:{i:0;a:5:{s:4:"data";s:8:"Personal";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:816:"Some time ago I wrote in Russian language guide for finding errors in SQL application.
To be honest I wrote it having personal aim to have a text which I can easily use refer in case of user questions about how to find particular thing. But this makes less sense having no English version. So now I started to translate it to English and publish. Introduction and first chapter are ready.
You can find it at http://sql-error.microbecal.com/en/index.html Comments and corrections of mistakes are welcome here.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Sveta Smirnova";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:38;a:6:{s:4:"data";s:53:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:25:"FlashCache: tpcc workload";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:43:"http://www.mysqlperformanceblog.com/?p=2899";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:72:"http://www.mysqlperformanceblog.com/2010/05/25/flashcache-tpcc-workload/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2795:"This is my last post in series on FlashCache testing when the cache is placed on Intel SSD card.
This time I am using tpcc-like workload with 1000 Warehouses ( that gives 100GB of data) on Dell PowerEdge R900 with 32GB of RAM, 22GB allocated for buffer pool and I put 70GB on FlashCache partition ( just to simply test the case when data does not fit into cache partition).
Please note in this configuration the benchmark is very write intensive, and it is not going be easy for FlashCache, as in background it has to write blocks to RAID anyway, and write rate in final place is limited by RAID. So all performance benefits will come from read hits
The full report and results are available on benchmark Wiki
http://www.percona.com/docs/wiki/benchmark:flashcache:tpcc:start.
Short version of results are on graph:
In summary:
on RAID final result: 2556.592 TpmC
on Intel SSD X25-M: 7084.483 TpmC
on FlashCache with 20% dirty pages: 2632.892 TpmC
on FlashCache with 80% dirty pages: 4468.883 TpmC
So with 20% dirty pages the benefit are really miserable, and it is quite explainable ( see note above about write intensive workload), but really on the graph we can see that probably 2h was not enough to warmup FlashCache enough.
And this is interesting problem with FlashCache what I see. Warmup by simple copying files does not work (you need O_DIRECT with proper blocksize), and you only rely on InnoDB in this case, and it takes about 30min+ to fill FlashCache. Solution there would be PRELOAD TABLE / INDEX, and it is in our roadmap for XtraDB.
With 80% dirty pages the performance gain in 1.7x and it is pretty decent, as you can get it for 500$ ( price of Intel X25-M card) ( for this particular workload, your experience may vary!).
On this stage I consider FlashCache as pretty stable and ready for an evaluation on real workloads ( kudos to Facebook team, they provide really stable release).
I actually did pretty bad test – just turned off power on SSD drive in the middle of tpcc-mysql run,
just SSD power, not whole server. No wonder FlashCache complained on failed writes, and after that I restarted full system. I was expecting that database is going to trash, but after restart FlashCache was able to attach previous cache, and MySQL was able to start and finish crash recovery. I am impressed.
In my next rounds I am looking to run similar benchmarks on FusionIO card.
P.S. And if CentOS team reads this post – please change default IO scheduler from CFQ to Deadline. Seriously, it makes so much damage on performance on servers with IO intensive workloads, so it should be the first action after CentOS installation. And I doubt that there big usage of CentOS on desktop systems anyway.
Entry posted by Vadim |
10 comments
Add to: | | | | ";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 05:04:13 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:3:{i:0;a:5:{s:4:"data";s:10:"benchmarks";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"xtradb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:5251:"This is my last post in series on FlashCache testing when the cache is placed on Intel SSD card.
This time I am using tpcc-like workload with 1000 Warehouses ( that gives 100GB of data) on Dell PowerEdge R900 with 32GB of RAM, 22GB allocated for buffer pool and I put 70GB on FlashCache partition ( just to simply test the case when data does not fit into cache partition).
Please note in this configuration the benchmark is very write intensive, and it is not going be easy for FlashCache, as in background it has to write blocks to RAID anyway, and write rate in final place is limited by RAID. So all performance benefits will come from read hits
The full report and results are available on benchmark Wiki
http://www.percona.com/docs/wiki/benchmark:flashcache:tpcc:start.
Short version of results are on graph:

In summary:
on RAID final result: 2556.592 TpmC
on Intel SSD X25-M: 7084.483 TpmC
on FlashCache with 20% dirty pages: 2632.892 TpmC
on FlashCache with 80% dirty pages: 4468.883 TpmC
So with 20% dirty pages the benefit are really miserable, and it is quite explainable ( see note above about write intensive workload), but really on the graph we can see that probably 2h was not enough to warmup FlashCache enough.
And this is interesting problem with FlashCache what I see. Warmup by simple copying files does not work (you need O_DIRECT with proper blocksize), and you only rely on InnoDB in this case, and it takes about 30min+ to fill FlashCache. Solution there would be PRELOAD TABLE / INDEX, and it is in our roadmap for XtraDB.
With 80% dirty pages the performance gain in 1.7x and it is pretty decent, as you can get it for 500$ ( price of Intel X25-M card) ( for this particular workload, your experience may vary!).
On this stage I consider FlashCache as pretty stable and ready for an evaluation on real workloads ( kudos to Facebook team, they provide really stable release).
I actually did pretty bad test – just turned off power on SSD drive in the middle of tpcc-mysql run,
just SSD power, not whole server. No wonder FlashCache complained on failed writes, and after that I restarted full system. I was expecting that database is going to trash, but after restart FlashCache was able to attach previous cache, and MySQL was able to start and finish crash recovery. I am impressed.
In my next rounds I am looking to run similar benchmarks on FusionIO card.
P.S. And if CentOS team reads this post – please change default IO scheduler from CFQ to Deadline. Seriously, it makes so much damage on performance on servers with IO intensive workloads, so it should be the first action after CentOS installation. And I doubt that there big usage of CentOS on desktop systems anyway.
Entry posted by Vadim |
10 comments
Add to:
|
|
|
| 
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:22:"MySQL Performance Blog";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:39;a:6:{s:4:"data";s:48:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:47:"Dirty pages, fast shutdown, and write combining";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://www.xaprb.com/blog/?p=1873";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:83:"http://www.xaprb.com/blog/2010/05/25/dirty-pages-fast-shutdown-and-write-combining/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:7662:"One of the things that makes a traditional transactional database hard to make highly available is a relatively slow shutdown and start-up time. Applications typically delegate most or all writes to the database, which tends to run with a lot of “dirty” data in its (often large) memory. At shutdown time, the dirty memory needs to be written to disk, so the recovery routine doesn’t have to run at startup. And even upon a clean startup, the database probably has to warm up, which can also take a very long time.
Some databases let the operating system handle most of their memory management needs. This has its own challenges, especially if the operating system’s design doesn’t align exactly with the database’s goals. Other databases take matters into their own hands. InnoDB (the de facto transactional MySQL storage engine) falls into this category; when properly configured to take advantage of modern hardware, it will use basically all of the server’s memory in a huge buffer pool, with files opened in O_DIRECT mode, bypassing the operating system for I/O operations.
The design choices, and the results, are worth thinking about. Assuming you shut down and restart infrequently, the choice to hold a lot of dirty memory has huge performance benefits, which has to be balanced against the desire for fast shutdown and recovery. In InnoDB, there are a few things you can configure that change the startup and shutdown behaviors, but you should understand the performance effects during normal operation.
First, let’s look at why it’s nice to run with lots of dirty data in memory.
Write combining
Most databases have a concept called a page, buffer, or block. This is a physical unit of data, which can typically store many logical units (rows). InnoDB defaults to 16kb pages of data. Imagine that your typical row is only 80 bytes long. A lot of rows can fit into 16kb in most uses.
Suppose you insert, update, or delete a row. Should InnoDB write the result to disk? If it does, it has to write the entire 16kb page, and any other index pages and so forth, which can add up to a lot of pages. That’s a lot of work for a little bitty 80-byte row! InnoDB leaves the pages dirty in its memory. When you commit the transaction, the write-ahead log ensures that if there’s a crash, the change is still permanent. (The log has very compact entries and is not page-oriented.)
Now suppose you make another little change. In many cases, there’s a decent probability that both of the changes touched the same page(s). In fact, if you had the statistics to prove it, you would probably see that the vast majority of your changes focus on a small fraction of the total pages, or even a small fraction of the rows. Most workloads have a very tall head and a very long tail. Tens, hundreds, even thousands of times more changes go to those same pages and rows, as compared to the less-active ones.
Eventually, our favorite “hot page” gets written so a checkpoint can complete. Tons of changes were written in a single write. This is write combining, and it’s a huge efficiency. Huge! Servers can accept many tens of thousands of writes per second, and guarantee ACID properties, because of write combining. If they didn’t combine writes, they’d be asked to do many more I/O operations per second.
Dirty pages and the long tail
The downside to this is the amount of dirty pages in memory, which have to be written out during shutdown. Shutdown is equivalent to a forced checkpoint. The server has been lazily delaying lots of work, because it knows it’s going to be able to combine writes. Suddenly, all the bills come due at once — time to write tons of data to disk! And the problem here is that the server’s memory can actually be mostly dirty data. By default, InnoDB lets the buffer pool get up to 90% dirty before it starts to get worried and work hard to flush pages.
If most writes go to the hottest pages, why should there be so many dirty pages? The answer is the long tail. The few writes that don’t go to the tall head go to a very scattered long tail. Again this is hard to prove, but many of those one-off writes are dirtying entire pages just for themselves, and those pages will not be dirtied by any other writes. So the long tail is full of 16kb pages that had only 80 bytes written to them. This ends up being a lot of pages of data.
Fast shutdown on demand
If you want your database to be able to shut down quickly if needed, what can you do about this? This is a tough question to answer. There are a few different strategies you might take.
You can configure InnoDB to keep the dirty pages to a minimum. The problem is, it starts to do a lot less write combining. Take an average web application’s database and lower the dirty page percent, and watch the disk activity. It will go through the roof. It starts furiously flushing pages, only to turn around and flush the same pages again an instant later. InnoDB isn’t particularly designed or optimized for this, by the way. Things will suffer. However, this is actually a useful technique for a planned fast shutdown.
You can lower the page size. If you make the pages smaller, then in theory you’ll do less work flushing those long-tail pages. Be careful with this! There is research (actual math, mind you) indicating that InnoDB’s default page size is already too small, and there isn’t a lot of real-world experience with non-default page sizes. The Tokutek folks know a lot about the math, by the way.
You can configure InnoDB not to flush dirty pages before a shutdown. This is essentially the same thing as shutting down without a checkpoint, which is the same as crashing. The recovery routine will have to run at startup before the database becomes available. That is likely to be much slower than a clean shutdown, due to the mechanics of crash recovery.
You might be able to make InnoDB capable of doing a lot more flushing by upgrading to a version that has separate threads for this purpose, and/or using native asynchronous I/O. This might not really help in shutdown; to tell the truth, I haven’t checked it.
No free lunch
InnoDB is a complex system that is trying to balance a lot of different factors for efficiency, while giving nice ACID properties. And it’s actually doing a pretty decent job of it by default. When you say you’d like more or less of such-and-such performance characteristic, then something else gets traded off. This is a really hard problem, and I’m not aware of anyone who has a brilliant solution to it, although I am far from a database research specialist.
Even the question of how much data to write, and how quickly, is a hard one. It’s hard and expensive to really answer accurately because the real answer requires knowledge of things such the frequency and distribution of page dirtying. Therefore, InnoDB kind of avoids this and lets you configure its “I/O capacity” and “dirty page percent” and maybe a few other things, depending on which version you use. These are just models that approximate the true answers to the real questions. All models are wrong. Some models are useful. InnoDB employs useful models that work a lot of the time.
Related posts:How fast is MySQL Table Checksum?A very fast FNV hash function for MySQLHow fast is MySQL replication?Get Maatkit fast from the command lineHow to write efficient archiving and purging jobs in SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Wed, 26 May 2010 02:24:19 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:2:{i:0;a:5:{s:4:"data";s:3:"SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:6:"InnoDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:9243:"One of the things that makes a traditional transactional database hard to make highly available is a relatively slow shutdown and start-up time. Applications typically delegate most or all writes to the database, which tends to run with a lot of “dirty” data in its (often large) memory. At shutdown time, the dirty memory needs to be written to disk, so the recovery routine doesn’t have to run at startup. And even upon a clean startup, the database probably has to warm up, which can also take a very long time.
Some databases let the operating system handle most of their memory management needs. This has its own challenges, especially if the operating system’s design doesn’t align exactly with the database’s goals. Other databases take matters into their own hands. InnoDB (the de facto transactional MySQL storage engine) falls into this category; when properly configured to take advantage of modern hardware, it will use basically all of the server’s memory in a huge buffer pool, with files opened in O_DIRECT mode, bypassing the operating system for I/O operations.
The design choices, and the results, are worth thinking about. Assuming you shut down and restart infrequently, the choice to hold a lot of dirty memory has huge performance benefits, which has to be balanced against the desire for fast shutdown and recovery. In InnoDB, there are a few things you can configure that change the startup and shutdown behaviors, but you should understand the performance effects during normal operation.
First, let’s look at why it’s nice to run with lots of dirty data in memory.
Write combining
Most databases have a concept called a page, buffer, or block. This is a physical unit of data, which can typically store many logical units (rows). InnoDB defaults to 16kb pages of data. Imagine that your typical row is only 80 bytes long. A lot of rows can fit into 16kb in most uses.
Suppose you insert, update, or delete a row. Should InnoDB write the result to disk? If it does, it has to write the entire 16kb page, and any other index pages and so forth, which can add up to a lot of pages. That’s a lot of work for a little bitty 80-byte row! InnoDB leaves the pages dirty in its memory. When you commit the transaction, the write-ahead log ensures that if there’s a crash, the change is still permanent. (The log has very compact entries and is not page-oriented.)
Now suppose you make another little change. In many cases, there’s a decent probability that both of the changes touched the same page(s). In fact, if you had the statistics to prove it, you would probably see that the vast majority of your changes focus on a small fraction of the total pages, or even a small fraction of the rows. Most workloads have a very tall head and a very long tail. Tens, hundreds, even thousands of times more changes go to those same pages and rows, as compared to the less-active ones.
Eventually, our favorite “hot page” gets written so a checkpoint can complete. Tons of changes were written in a single write. This is write combining, and it’s a huge efficiency. Huge! Servers can accept many tens of thousands of writes per second, and guarantee ACID properties, because of write combining. If they didn’t combine writes, they’d be asked to do many more I/O operations per second.
Dirty pages and the long tail
The downside to this is the amount of dirty pages in memory, which have to be written out during shutdown. Shutdown is equivalent to a forced checkpoint. The server has been lazily delaying lots of work, because it knows it’s going to be able to combine writes. Suddenly, all the bills come due at once — time to write tons of data to disk! And the problem here is that the server’s memory can actually be mostly dirty data. By default, InnoDB lets the buffer pool get up to 90% dirty before it starts to get worried and work hard to flush pages.
If most writes go to the hottest pages, why should there be so many dirty pages? The answer is the long tail. The few writes that don’t go to the tall head go to a very scattered long tail. Again this is hard to prove, but many of those one-off writes are dirtying entire pages just for themselves, and those pages will not be dirtied by any other writes. So the long tail is full of 16kb pages that had only 80 bytes written to them. This ends up being a lot of pages of data.
Fast shutdown on demand
If you want your database to be able to shut down quickly if needed, what can you do about this? This is a tough question to answer. There are a few different strategies you might take.
- You can configure InnoDB to keep the dirty pages to a minimum. The problem is, it starts to do a lot less write combining. Take an average web application’s database and lower the dirty page percent, and watch the disk activity. It will go through the roof. It starts furiously flushing pages, only to turn around and flush the same pages again an instant later. InnoDB isn’t particularly designed or optimized for this, by the way. Things will suffer. However, this is actually a useful technique for a planned fast shutdown.
- You can lower the page size. If you make the pages smaller, then in theory you’ll do less work flushing those long-tail pages. Be careful with this! There is research (actual math, mind you) indicating that InnoDB’s default page size is already too small, and there isn’t a lot of real-world experience with non-default page sizes. The Tokutek folks know a lot about the math, by the way.
- You can configure InnoDB not to flush dirty pages before a shutdown. This is essentially the same thing as shutting down without a checkpoint, which is the same as crashing. The recovery routine will have to run at startup before the database becomes available. That is likely to be much slower than a clean shutdown, due to the mechanics of crash recovery.
- You might be able to make InnoDB capable of doing a lot more flushing by upgrading to a version that has separate threads for this purpose, and/or using native asynchronous I/O. This might not really help in shutdown; to tell the truth, I haven’t checked it.
No free lunch
InnoDB is a complex system that is trying to balance a lot of different factors for efficiency, while giving nice ACID properties. And it’s actually doing a pretty decent job of it by default. When you say you’d like more or less of such-and-such performance characteristic, then something else gets traded off. This is a really hard problem, and I’m not aware of anyone who has a brilliant solution to it, although I am far from a database research specialist.
Even the question of how much data to write, and how quickly, is a hard one. It’s hard and expensive to really answer accurately because the real answer requires knowledge of things such the frequency and distribution of page dirtying. Therefore, InnoDB kind of avoids this and lets you configure its “I/O capacity” and “dirty page percent” and maybe a few other things, depending on which version you use. These are just models that approximate the true answers to the real questions. All models are wrong. Some models are useful. InnoDB employs useful models that work a lot of the time.
Related posts:
- How fast is MySQL Table Checksum?
- A very fast FNV hash function for MySQL
- How fast is MySQL replication?
- Get Maatkit fast from the command line
- How to write efficient archiving and purging jobs in SQL
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:22:"Baron Schwartz (xaprb)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:40;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:20:"Down the rabbit hole";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:32:"http://hackmysql.com/blog/?p=112";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:58:"http://hackmysql.com/blog/2010/05/25/down-the-rabbit-hole/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2274:"Generally I avoid going down rabbit holes but today I decided to see how deep a particular testing rabbit hole went. This post is a third in what seems be a continuing series of programming anecdotes. It’s not particularly MySQL-related so you can stop reading here unless you grok code stuff.
Before beginning work on issue 720 I ran the mk-table-checksum test suite to make sure it was in working order. No sense writing new tests and code when the old tests and code aren’t reliable. I actually made one seemingly innocuous change to the test suite in preparation for the issue: I changed the –replicate checksum table from MyISAM to InnoDB.
Surprisingly, the test suite proved unstable. Random tests would fail at random times. Some instability was due to new tests for other issues that I wrote poorly and hadn’t been run a zillion times yet. But other instability was due to switching the checksum table to InnoDB. I knew this because I could switch it back to InnoDB and the tests were ok. Thus began my descent into the rabbit hole.
In particular the test for issue 982 was not finishing or, if it did, it took forever. I discovered that the problem had to do with –lock even though it shouldn’t since the docu says that –replicate and –lock are useless together because the former eliminates locking concerns. Obviously not. So for this I created issue 1027.
Then I turned my attention to test 207_issue_51.t for issue 51 which was opened 2 years ago and closed 1 year ago as WontFix because the problem could not be reproduced. Some might have thought it forgotten amongst the nearly 1,000 issues since its time, but it was found again at the end of this rabbit hole and revivified.
It turns out that issue 51 is, as the original reporter stated (partially), caused by –wait when (this is the part the original reporter didn’t state) the –replicate checksum table is InnoDB because –wait enables –lock which disables AutoCommit so writes to the checksum table may not be committed.
So the rabbit hole lead from issue 720 to issue 982 to issue 1027 to issue 51. Then end results will be a needed bug fix and faster, more stable tests. Sometimes it pays to follow the rabbit.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 22:28:43 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:11:"Programming";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:17:"mk-table-checksum";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:6:"rabbit";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:7:"testing";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2870:"Generally I avoid going down rabbit holes but today I decided to see how deep a particular testing rabbit hole went. This post is a third in what seems be a continuing series of programming anecdotes. It’s not particularly MySQL-related so you can stop reading here unless you grok code stuff.
Before beginning work on issue 720 I ran the mk-table-checksum test suite to make sure it was in working order. No sense writing new tests and code when the old tests and code aren’t reliable. I actually made one seemingly innocuous change to the test suite in preparation for the issue: I changed the –replicate checksum table from MyISAM to InnoDB.
Surprisingly, the test suite proved unstable. Random tests would fail at random times. Some instability was due to new tests for other issues that I wrote poorly and hadn’t been run a zillion times yet. But other instability was due to switching the checksum table to InnoDB. I knew this because I could switch it back to InnoDB and the tests were ok. Thus began my descent into the rabbit hole.
In particular the test for issue 982 was not finishing or, if it did, it took forever. I discovered that the problem had to do with –lock even though it shouldn’t since the docu says that –replicate and –lock are useless together because the former eliminates locking concerns. Obviously not. So for this I created issue 1027.
Then I turned my attention to test 207_issue_51.t for issue 51 which was opened 2 years ago and closed 1 year ago as WontFix because the problem could not be reproduced. Some might have thought it forgotten amongst the nearly 1,000 issues since its time, but it was found again at the end of this rabbit hole and revivified.
It turns out that issue 51 is, as the original reporter stated (partially), caused by –wait when (this is the part the original reporter didn’t state) the –replicate checksum table is InnoDB because –wait enables –lock which disables AutoCommit so writes to the checksum table may not be committed.
So the rabbit hole lead from issue 720 to issue 982 to issue 1027 to issue 51. Then end results will be a needed bug fix and faster, more stable tests. Sometimes it pays to follow the rabbit.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:14:"Daniel Nichter";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:41;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:49:"Disk seeks are evil, so let’s avoid them, pt. 2";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:26:"http://tokutek.com/?p=1484";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:71:"http://tokutek.com/2010/05/disk-seeks-are-evil-so-lets-avoid-them-pt-2/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:3245:"
In part 1, I discussed why having many disk seeks are bad (they slow down performance), and how fractal tree data structures minimize disk seeks on ad-hoc insertions, whereas B-trees practically guarantee that disk seeks are performed on ad-hoc insertions. As a result, fractal tree data structures can insert data up to two orders of magnitude faster than B-Trees can.
Now that insertion disk seeks are out of the way (and I don’t want to shortchange the importance of getting rid of these seeks!), let’s look at other places where databases perform seeks, and see if we can get rid of them. Over my next couple of posts, I will look at several use cases and analyze whether disk seeks are required. If disk seeks are required, then performance will suffer on large amounts of data, for TokuDB and any other disk-based storage engines.
If disk seeks are not required, things get interesting. Removing these unnecessary disk can speed up a database as long as all disk seeks in a command execution are removed. Since TokuDB eliminates seeks on insertions, we should avoid disk seeks altogether. Since B-trees induce disk seeks on ad-hoc insertions, cleaning up the remaining disk seeks has limited utility.
For today, let’s look at a simple use case that may be obvious: insertions on secondary indexes v. unique secondary indexes. Take the following table:
Create Table: CREATE TABLE `t` (
`a` int(11) NOT NULL AUTO_INCREMENT,
`b` int(11) DEFAULT NULL,
`c` int(11) DEFAULT NULL,
PRIMARY KEY (`a`),
UNIQUE KEY `b_unique` (`b`),
KEY `b_norm` (`b`)
)
Suppose most of the table resides on disk.
Now I run:
insert into t (b) values (1000);
Are there any mandatory disk seeks involved?
When inserting into the fractal tree for the primary dictionary, we use an auto increment value, so insertions are sequential. Insertions run really fast, because a disk seek is usually not mandatory (disk seeks eventually happen when blocks get full, but they do not occur on EACH insertion).
Let’s look at inserting into b_unique and b_norm visually. Take the following identical fractal trees for ‘b_unique’ and ‘b_norm’:
-
- -
- - - -
...
1, 3, 5, 7, ..., 999, 1001, 1003, ...
To insert into ‘b_norm’, the fractal tree can simply insert (1000) in the top node. To insert into ‘b_unique’, the fractal tree must first search for 1000, verify that it is not between 999 and 1001, and then insert into the top node. This lookup causes a disk seek and slows down the insertion.
Note that because B-trees require disk seeks to do insertions anyway, some operations come with no additional cost in B-trees. Uniqueness checks are one such example. As a result, some B-tree users may not think twice about making secondary keys unique (after all, unique keys can help the query optimizer). Fractal tree data structures, on the other hand, incur a huge cost for a uniqueness check.
So, the moral of this story is if you care about insertion performance and avoiding disk seeks, try to avoid unique secondary keys, and go with normal secondary keys. Otherwise, fractal tree data structures will be just as slow as B-trees, and not two orders of magnitude faster.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 20:23:26 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:8:"TokuView";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:13:"Fractal Trees";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"TokuDB";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:3663:"
In part 1, I discussed why having many disk seeks are bad (they slow down performance), and how fractal tree data structures minimize disk seeks on ad-hoc insertions, whereas B-trees practically guarantee that disk seeks are performed on ad-hoc insertions. As a result, fractal tree data structures can insert data up to two orders of magnitude faster than B-Trees can.
Now that insertion disk seeks are out of the way (and I don’t want to shortchange the importance of getting rid of these seeks!), let’s look at other places where databases perform seeks, and see if we can get rid of them. Over my next couple of posts, I will look at several use cases and analyze whether disk seeks are required. If disk seeks are required, then performance will suffer on large amounts of data, for TokuDB and any other disk-based storage engines.
If disk seeks are not required, things get interesting. Removing these unnecessary disk can speed up a database as long as all disk seeks in a command execution are removed. Since TokuDB eliminates seeks on insertions, we should avoid disk seeks altogether. Since B-trees induce disk seeks on ad-hoc insertions, cleaning up the remaining disk seeks has limited utility.
For today, let’s look at a simple use case that may be obvious: insertions on secondary indexes v. unique secondary indexes. Take the following table:
Create Table: CREATE TABLE `t` (
`a` int(11) NOT NULL AUTO_INCREMENT,
`b` int(11) DEFAULT NULL,
`c` int(11) DEFAULT NULL,
PRIMARY KEY (`a`),
UNIQUE KEY `b_unique` (`b`),
KEY `b_norm` (`b`)
)
Suppose most of the table resides on disk.
Now I run:
insert into t (b) values (1000);
Are there any mandatory disk seeks involved?
When inserting into the fractal tree for the primary dictionary, we use an auto increment value, so insertions are sequential. Insertions run really fast, because a disk seek is usually not mandatory (disk seeks eventually happen when blocks get full, but they do not occur on EACH insertion).
Let’s look at inserting into b_unique and b_norm visually. Take the following identical fractal trees for ‘b_unique’ and ‘b_norm’:
-
- -
- - - -
...
1, 3, 5, 7, ..., 999, 1001, 1003, ...
To insert into ‘b_norm’, the fractal tree can simply insert (1000) in the top node. To insert into ‘b_unique’, the fractal tree must first search for 1000, verify that it is not between 999 and 1001, and then insert into the top node. This lookup causes a disk seek and slows down the insertion.
Note that because B-trees require disk seeks to do insertions anyway, some operations come with no additional cost in B-trees. Uniqueness checks are one such example. As a result, some B-tree users may not think twice about making secondary keys unique (after all, unique keys can help the query optimizer). Fractal tree data structures, on the other hand, incur a huge cost for a uniqueness check.
So, the moral of this story is if you care about insertion performance and avoiding disk seeks, try to avoid unique secondary keys, and go with normal secondary keys. Otherwise, fractal tree data structures will be just as slow as B-trees, and not two orders of magnitude faster.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"Tokuview Blog";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:42;a:6:{s:4:"data";s:163:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:25:"451 CAOS Links 2010.05.25";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:47:"http://blogs.the451group.com/opensource/?p=1817";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:60:"http://feedproxy.google.com/~r/451opensource/~3/fyOiMBkLirc/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2326:"What’s missing from WebM? VoltDB launches. The importance of profitability. And more.
Follow 451 CAOS Links live @caostheory on Twitter and Identi.ca
“Tracking the open source news wires, so you don’t have to.”
# Simon Phipps examined what’s missing from WebM, from an open source perspective.
# Mike Stonebraker’s VoltDB officially launched its open source in-memory OLTP database.
# Jim Whitehurst argued that one of Red Hat’s most valuable contributions to open source is its profitability.
# Infobright appointed former Aleri CEO Don DeLoach as its new president and chief executive.
# Monty Program launched an Unlimited support offering for a company’s entire MySQL/MariaDB estate.
# Red Hat has announced the availability of Fedora 13.
# Terracotta claimed 100 customers have upgraded to the enterprise edition of Ehcache in the last 10 months.
# Stéphane Croisier discussed the future of open source CMS, and the future of open core.
# Pogo Linux released a new line of StorageDirector Z2 Foundation and StorageDirector Z2 HA Cluster products.
# Couchio started testing a hosted CouchDB service.
# A group of implementers of the open source ERP application ADempiere formed ADempiere Business Consultants.
# Simon Phipps argued the case for the continuing relevance of the Open Source Initiative.
# Red Hat’s Paul Cormier disputed Oracle’s open source credentials.
# BitTorrent released an open source implementation of its µTP protocol.
# Microsoft released two new open source projects for interoperability with Outlook.
# Carlo Daffara discussed the limited potential in trying to convert open source users into paying customers.
# When should you use Hadoop? Cloudera’s Jeff Bean offered some suggestions.
# Andrew Oliver argued that for Microsoft, open source means “Windows Encumbered” although without examples.
# While Mark Stone argued in favor of constructive engagement between open source and Microsoft.
# ibatis has become MyBatis and moved from Apache to Google Code.
# Who will build the LAMP cloud? Or does cloud computing need LAMP?
# CIO Update reported on Red Hat’s plans to commercialize deltaCloud.
# Linux trading system to save London Stock Exchange £10m a year, Computerworld reported.
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 18:50:29 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:25:{i:0;a:5:{s:4:"data";s:8:"Software";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:9:"451 group";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:13:"451caostheory";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:8:"451group";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:9:"adempiere";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:13:"andrew oliver";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:10:"bittorrent";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:7;a:5:{s:4:"data";s:10:"caostheory";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:8;a:5:{s:4:"data";s:13:"Carlo daffara";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:9;a:5:{s:4:"data";s:8:"cloudera";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:10;a:5:{s:4:"data";s:7:"couchdb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:11;a:5:{s:4:"data";s:7:"couchio";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:12;a:5:{s:4:"data";s:10:"deltacloud";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:13;a:5:{s:4:"data";s:7:"EHCache";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:14;a:5:{s:4:"data";s:6:"google";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:15;a:5:{s:4:"data";s:6:"Hadoop";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:16;a:5:{s:4:"data";s:6:"ibatis";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:17;a:5:{s:4:"data";s:10:"infobright";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:18;a:5:{s:4:"data";s:14:"jim whitehurst";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:19;a:5:{s:4:"data";s:5:"Linux";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:20;a:5:{s:4:"data";s:21:"london stock exchange";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:21;a:5:{s:4:"data";s:7:"mariadb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:22;a:5:{s:4:"data";s:10:"mark stone";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:23;a:5:{s:4:"data";s:11:"matt aslett";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:24;a:5:{s:4:"data";s:3:"mat";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:3749:"What’s missing from WebM? VoltDB launches. The importance of profitability. And more.
Follow 451 CAOS Links live @caostheory on Twitter and Identi.ca
“Tracking the open source news wires, so you don’t have to.”
# Simon Phipps examined what’s missing from WebM, from an open source perspective.
# Mike Stonebraker’s VoltDB officially launched its open source in-memory OLTP database.
# Jim Whitehurst argued that one of Red Hat’s most valuable contributions to open source is its profitability.
# Infobright appointed former Aleri CEO Don DeLoach as its new president and chief executive.
# Monty Program launched an Unlimited support offering for a company’s entire MySQL/MariaDB estate.
# Red Hat has announced the availability of Fedora 13.
# Terracotta claimed 100 customers have upgraded to the enterprise edition of Ehcache in the last 10 months.
# Stéphane Croisier discussed the future of open source CMS, and the future of open core.
# Pogo Linux released a new line of StorageDirector Z2 Foundation and StorageDirector Z2 HA Cluster products.
# Couchio started testing a hosted CouchDB service.
# A group of implementers of the open source ERP application ADempiere formed ADempiere Business Consultants.
# Simon Phipps argued the case for the continuing relevance of the Open Source Initiative.
# Red Hat’s Paul Cormier disputed Oracle’s open source credentials.
# BitTorrent released an open source implementation of its µTP protocol.
# Microsoft released two new open source projects for interoperability with Outlook.
# Carlo Daffara discussed the limited potential in trying to convert open source users into paying customers.
# When should you use Hadoop? Cloudera’s Jeff Bean offered some suggestions.
# Andrew Oliver argued that for Microsoft, open source means “Windows Encumbered” although without examples.
# While Mark Stone argued in favor of constructive engagement between open source and Microsoft.
# ibatis has become MyBatis and moved from Apache to Google Code.
# Who will build the LAMP cloud? Or does cloud computing need LAMP?
# CIO Update reported on Red Hat’s plans to commercialize deltaCloud.
# Linux trading system to save London Stock Exchange £10m a year, Computerworld reported.

PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"The 451 Group";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:43;a:6:{s:4:"data";s:38:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:5:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:39:"Drizzle, Dexter, Beta is on the horizon";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:39:"http://krow.livejournal.com/690635.html";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:39:"http://krow.livejournal.com/690635.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:2640:"The latest news for Drizzle!We completed the Cherry roadmap about a month ago. Right before the MySQL User's Conference. Cherry was out last big release before we were to begin work on finalizing the Beta (aka Dexter). What is in Dexter?The Dexter release is all about stabilizing Drizzle so that we can get a Beta shipped out at the end of the summer. Stewart has been working on switching our core Innodb to use the downstream version of the Inndb Embedded Engine (aka HailDB. You can read more about HailDB here. With much of the recent speculation and rumors around the demise of the Innodb Emebdded Engine, HailDB gives the open source community a stable future going forward, and it opens up development to the project in a more organic method. More public testing, and additional developers will lead to a better codebase for development to happen.We recently added another full-time person, Patrick Crews to our midst who is hard at work doing QA for us. Patrick is extending Randgen for drizzle, a testing suite built to hammer databases for stability. He is also working on extending our testing for our replication system, and in general making sure we are doing our utmost to test Drizzle.The new, entirely ANSI, information schema will be included alongside our federated data dictionary which includes new table functions that allow you to see more of the internal state of Innodb and the Drizzle microkernel. A number of performance enhancements are being done, and the fail safe ALTER TABLE code is going in. You will never have to worry again that a crash during an ALTER TABLE will either end up with corrupted tables or lost table spaces. Rollback on DDL becomes a natural operation for Drizzle when using Innodb.Recently PBXT was added to Drizzle, and Paul has been working on optimizing PBXT for the Drizzle architecture. Primebase, the company behind PBXT, also recently contributed the new events observer code to Drizzle. This allows us to place triggers on DDL, DML, and other operations. We have eight Google Summer of Code students working on different projects for the summer. New unit testing, boost command line and file configuration, and engines to access cloud services are just a few of the projects being done by students this year. All of this is being packaged up for the latest distributions so new debian/ubuntu and RPM packages are on their way.Next week there are two talks at Open Source Bridges on Drizzle. One I am giving, plus another being given by Padraig O'sullivan on Developing Replication Plugins for Drizzle. Later in the summer we have another talk at O'Reilly's Open Source Conference.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 17:54:03 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:3460:"The latest news for Drizzle!
We completed the Cherry roadmap about a month ago. Right before the MySQL User's Conference. Cherry was out last big release before we were to begin work on finalizing the Beta (aka Dexter).
What is in Dexter?
The Dexter release is all about stabilizing Drizzle so that we can get a Beta shipped out at the end of the summer. Stewart has been working on switching our core Innodb to use the downstream version of the Inndb Embedded Engine (aka HailDB. You can read more about HailDB here. With much of the recent speculation and rumors around the demise of the Innodb Emebdded Engine, HailDB gives the open source community a stable future going forward, and it opens up development to the project in a more organic method. More public testing, and additional developers will lead to a better codebase for development to happen.
We recently added another full-time person, Patrick Crews to our midst who is hard at work doing QA for us. Patrick is extending Randgen for drizzle, a testing suite built to hammer databases for stability. He is also working on extending our testing for our replication system, and in general making sure we are doing our utmost to test Drizzle.
The new, entirely ANSI, information schema will be included alongside our federated data dictionary which includes new table functions that allow you to see more of the internal state of Innodb and the Drizzle microkernel. A number of performance enhancements are being done, and the fail safe ALTER TABLE code is going in. You will never have to worry again that a crash during an ALTER TABLE will either end up with corrupted tables or lost table spaces. Rollback on DDL becomes a natural operation for Drizzle when using Innodb.
Recently PBXT was added to Drizzle, and Paul has been working on optimizing PBXT for the Drizzle architecture. Primebase, the company behind PBXT, also recently contributed the new events observer code to Drizzle. This allows us to place triggers on DDL, DML, and other operations.
We have eight Google Summer of Code students working on different projects for the summer. New unit testing, boost command line and file configuration, and engines to access cloud services are just a few of the projects being done by students this year.
All of this is being packaged up for the latest distributions so new debian/ubuntu and RPM packages are on their way.
Next week there are two talks at Open Source Bridges on Drizzle. One I am giving, plus another being given by Padraig O'sullivan on Developing Replication Plugins for Drizzle. Later in the summer we have another talk at O'Reilly's Open Source Conference.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:10:"Brian Aker";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:44;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:37:"A better way to build Cacti templates";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:33:"http://www.xaprb.com/blog/?p=1869";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:75:"http://www.xaprb.com/blog/2010/05/25/a-better-way-to-build-cacti-templates/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1520:"The traditional way to build Cacti templates is through the Cacti web interface. This is an enormous amount of work, and the result is generally not very consistent or good quality. The process is too error-prone. You can export the templates as XML, but they tend to have problems such as version incompatibilities with other Cacti installations, and it’s hard to adapt them for user preferences such as different graph image sizes and polling intervals.
The way I build Cacti templates is exactly the opposite. I create a data structure in a file, which looks like many configuration file syntaxes you’ve probably worked with. It represents the graphs, templates, scripts, and so on. From this, a tool generates the XML template file, which is a universal template definition, and is a breeze to import into Cacti. It is completely consistent and has zero cruft in it. This process prevents errors, and the results are perfect every time. (There’s a test suite, by the way.)
All the tools, documentation, how-tos, examples, and pre-fabricated scripts and templates you need are at the Better Cacti Templates open-source project. If you want to build your own templates, pay special attention to the documentation on creating graphs.
Related posts:Version 1.1.4 of improved Cacti templates releasedMySQL Cacti templates version 1.1.1 releasedVersion 1.1.5 of improved Cacti templates releasedVersion 1.1.7 of Better Cacti Templates releasedVersion 1.1.6 of Better Cacti Templates released";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 13:56:53 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:5:"Cacti";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:3:"SQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:9:"Sys Admin";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:5:"Tools";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2914:"The traditional way to build Cacti templates is through the Cacti web interface. This is an enormous amount of work, and the result is generally not very consistent or good quality. The process is too error-prone. You can export the templates as XML, but they tend to have problems such as version incompatibilities with other Cacti installations, and it’s hard to adapt them for user preferences such as different graph image sizes and polling intervals.
The way I build Cacti templates is exactly the opposite. I create a data structure in a file, which looks like many configuration file syntaxes you’ve probably worked with. It represents the graphs, templates, scripts, and so on. From this, a tool generates the XML template file, which is a universal template definition, and is a breeze to import into Cacti. It is completely consistent and has zero cruft in it. This process prevents errors, and the results are perfect every time. (There’s a test suite, by the way.)
All the tools, documentation, how-tos, examples, and pre-fabricated scripts and templates you need are at the Better Cacti Templates open-source project. If you want to build your own templates, pay special attention to the documentation on creating graphs.
Related posts:
- Version 1.1.4 of improved Cacti templates released
- MySQL Cacti templates version 1.1.1 released
- Version 1.1.5 of improved Cacti templates released
- Version 1.1.7 of Better Cacti Templates released
- Version 1.1.6 of Better Cacti Templates released
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:22:"Baron Schwartz (xaprb)";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:45;a:6:{s:4:"data";s:38:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:5:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:38:"The reason it's been quiet: MyQuery...";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:70:"tag:blogger.com,1999:blog-9144505959002328789.post-9002652283289581571";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:82:"http://karlssonondatabases.blogspot.com/2010/05/reason-its-been-quiet-myquery.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:1365:"I don't know if you have used my MyQuery query tool, which is a MySQL Query and Scripting tool for Windows environments, but that is what has taken up most of my spare time recently. It started with an idea of adding a feature to be able to run any custom SQL SELECT in a non-modal dialog, for customized monitoring of MySQL.This turned into a much more extensive custimization feature, enabling scripts, plugins, external programs and web-links to extend MyQuery. I'm now pretty close to code complete with this feature, and I think it's rather cool. But in the process of getting this done, and to enable as many features as possible in the extensions, I had to clean up quite a bit of code in other places, where things just weren't as self-contained as I had wanted them to be.The main things that remains to be done before releasing MyQuery 3.3 is documentation, testing and some cleanup. And I don't know if you've noticed, but doing your own documentation is a good thing, as it foces you to revisit what you have done, but from another angle, and allows some silly bugs to be fixed. And if you haven't used MyQuery, you can see that although the documentation isn't the most organized around, it's pretty comprehensive.Once MyQuery 3.3 is out the door, I'll get back to do some benchmarking and to make the promissed push for GIS features in MySQL./Karlsson";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 11:24:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:4152:"I don't know if you have used my MyQuery query tool, which is a MySQL Query and Scripting tool for Windows environments, but that is what has taken up most of my spare time recently. It started with an idea of adding a feature to be able to run any custom SQL SELECT in a non-modal dialog, for customized monitoring of MySQL.
This turned into a much more extensive custimization feature, enabling scripts, plugins, external programs and web-links to extend MyQuery. I'm now pretty close to code complete with this feature, and I think it's rather cool. But in the process of getting this done, and to enable as many features as possible in the extensions, I had to clean up quite a bit of code in other places, where things just weren't as self-contained as I had wanted them to be.
The main things that remains to be done before releasing MyQuery 3.3 is documentation, testing and some cleanup. And I don't know if you've noticed, but doing your own documentation is a good thing, as it foces you to revisit what you have done, but from another angle, and allows some silly bugs to be fixed. And if you haven't used MyQuery, you can see that although the documentation isn't the most organized around, it's pretty comprehensive.
Once MyQuery 3.3 is out the door, I'll get back to do some benchmarking and to make the promissed push for GIS features in MySQL.
/Karlsson
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:15:"Anders Karlsson";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:46;a:6:{s:4:"data";s:58:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:47:"MySQL, Oracle and NoSQL: In the grand scheme...";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:59:"tag:blogger.com,1999:blog-15319370.post-8185806028367157881";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:80:"http://rpbouman.blogspot.com/2010/05/mysql-oracle-and-nosql-in-grand-scheme.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:860:"...NoSQL is just larger than a fly's dropping, and MySQL and Oracle are more alike than either of their respective fanboys would like to admit. Courtesy of Google trends:I guess I won't be changing my career just yet.UPDATE: I tried a few terms for "Microsoft SQL Server" before posting (SQL Server, MS SQL) but found none that came up with what I felt like was a realistic volume (they are all much, much lower than I expected). @MarkGStacey suggested trying "SQL 2008", "SQL 2005" and "SQL 2000", and those return much better results indeed (though still much lower than MySQL or Oracle). Anyway - I'd love to have some way of bunching up all those terms and have Google Trends show them as one trend, but I haven't figured out a way to do that. If you know how, please drop a line at let me know. I'll adjust the blog if I find a more satisfactory solution.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 09:00:00 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:4:{i:0;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:10:"PostgreSQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"NoSQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"Oracle";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:1536:"...NoSQL is just larger than a fly's dropping, and MySQL and Oracle are more alike than either of their respective fanboys would like to admit.
Courtesy of Google trends:
I guess I won't be changing my career just yet.
UPDATE: I tried a few terms for "Microsoft SQL Server" before posting (SQL Server, MS SQL) but found none that came up with what I felt like was a realistic volume (they are all much, much lower than I expected).
@MarkGStacey suggested trying "SQL 2008", "SQL 2005" and "SQL 2000", and those return much better results indeed (though still much lower than MySQL or Oracle). Anyway - I'd love to have some way of bunching up all those terms and have Google Trends show them as one trend, but I haven't figured out a way to do that. If you know how, please drop a line at let me know.
I'll adjust the blog if I find a more satisfactory solution.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:13:"Roland Bouman";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:47;a:6:{s:4:"data";s:68:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:88:"Intra-query parallelism for MySQL queries without an appliance or closed source database";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:43:"http://swanhart.livejournal.com/132947.html";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:43:"http://swanhart.livejournal.com/132947.html";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:6066:"*edit* I want to point out that this test was done on a single database server which used MySQL partitioning. This is a demonstration of how Shard-Query can improve performance in non-sharded databases too.*edit*.Over the weekend I spent a lot of time improving my new Shard-Query tool (code.google.com/p/shard-query) and the improvements can equate to big performance gains on partitioned data sets versus executing the query directly on MySQL.I'll explain this graph below, but lower is better (response time) and Shard-Query is the red line.MySQL understands that queries which access data in only certain partitions don't have to read the rest of the table. This partition elimination works well, but MySQL left a big optimization out of partitioning: getting data in parallel. In fact, since partition elimination is the only major optimization provided by the partition options it isn't great for scaling access to large data sets when the entire data set must be accessed, but only when smaller parts of a the set are examined. Since Shard-Query exploits parallelism with Gearman (http://www.gearman.org) I decided to extend the Shard-Query "optimizer" to support running queries with IN lists in parallel. This makes a query scale much further than it would if there was no parallelism at work.Consider the table following partitioned fact table:
CREATE TABLE `fact` (
`id` bigint(20) unsigned DEFAULT NULL,
`a_id` bigint(20) unsigned DEFAULT NULL,
`b_id` int(11) NOT NULL,
`c_id` int(11) NOT NULL,
`i1` tinyint(4) DEFAULT NULL,
`qty` smallint(6) DEFAULT NULL,
`score` decimal(10,10) DEFAULT NULL,
`price` decimal(7,3) DEFAULT NULL,
`i2` int(11) DEFAULT NULL,
`i3` int(11) DEFAULT NULL,
`wide_row` char(54) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1
/*!50100 PARTITION BY HASH (i1) PARTITIONS 100 */
The table is partitioned into 100 partitions, and there are 100 distinct values for i1. This means that all the values for a particular i1 are housed in a single partition.Consider the following query:
select price*qty from fact where i1 in (1,2,3);
This query is semantically equivalent to:
select price*qty from fact where i1 = 1
UNION ALL
select price*qty from fact where i1 = 2
UNION ALL
select price*qty from fact where i1 = 3
Unfortunately, MySQL does not have any intra-query parallelism, so rewriting the query that way is not an effective scaling strategy. However, if you execute all three queries at the same time, and use a temporary table as the UNION ALL, you can actually get parallelism. This is what Shard-Query does. It can take each IN list item and assign it to a worker, and stuff the results back together at the end.The second thing that can be done to improve performance at the partition level (or the shard level) is to push down aggregation of distributable aggregate functions to the worker. If you've read my blog before you might know that the distributable aggregate functions are SUM and COUNT.Consider a query very much like the previous query:
select shard_col, sum(price * qty) from t1 where shard_col in (1,2,3) group by shard_col;
This query features aggregation with distributable functions. This query is semantically equivalent to the following:This query is semantically equivalent to:
SELCT shard_col, SUM(`sum(price*qty)`) as `sum(price*qty)`
from ( select shard_col, sum(price*qty) from t1 where shard_col = 1 group by shard_col
UNION ALL
select shard_col, sum(price*qty) from t1 where shard_col = 2 group by shard_col
UNION ALL
select shard_col, sum(price*qty) from t1 where shard_col = 3 group by shard_col
) GROUP BY shard_col;
Shard-Query can push the aggregation down to the shards and it sends each query which is part of the "UNION ALL" operation in parallel.I ran a benchmark based on the above table with 40M rows in it. That is 400K rows per shard. The benchmark queries follow the pattern:
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3,4) group by i1;
...
All the way up to 100 values in the IN list. The table contains 400K rows for each i1 value.Here is that graph again. For this test I used an EC2 "c1.large" instance. That is, 8 cores with 8 GB of memory. I used a 4GB data set, Percona Server 10.2 and a 1GB buffer pool size. Each partition is approximately 32MB in size.Since the machine has eight cores, I started eight Gearman workers. The results are, I think, impressive. In the graph, "partitions scanned" is the number of values in the IN list.This performs very well due to the pushdown of the aggregation. If a non-distributable aggregate function were to be used, then performance would probably be worse than MySQL because all 40M rows would be accessed and copied into a temporary table. However, it can add a lot of parallelism to queries which scan multiple partitions but return fewer rows, that is, queries with a more restrictive where clause.
This is the simple mysql benchmark script:
[root@localhost benchmarks]# cat inlist_test.php
$conn = mysql_connect() or die(mysql_error());
$fh = fopen('inlist.sql', 'r');
while($line = fgets($fh)) {
$start=microtime(true);
$stmt = mysql_query($line, $conn);
while($row = mysql_fetch_assoc($stmt)) {
}
mysql_free_result($stmt);
echo "WALLTIME::" . (microtime(true) - $start) . "s\n";
}
[root@localhost benchmarks]# head inlist.sql -n4
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3,4) group by i1;
The Shard query times were captured with:
cat inlist.sql | ./shard_query.php --ini=parallel_query.ini --one --inlist=* --pushdown|grep WALL
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 05:50:21 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:6:{i:0;a:5:{s:4:"data";s:9:"benchmark";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:11:"parallelism";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:8:"sharding";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:11:"shard-query";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:11:"performance";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:7116:"*edit* I want to point out that this test was done on a single database server which used MySQL partitioning. This is a demonstration of how Shard-Query can improve performance in non-sharded databases too.*edit*.
Over the weekend I spent a lot of time improving my new Shard-Query tool (code.google.com/p/shard-query) and the improvements can equate to big performance gains on partitioned data sets versus executing the query directly on MySQL.

I'll explain this graph below, but lower is better (response time) and Shard-Query is the red line.
MySQL understands that queries which access data in only certain partitions don't have to read the rest of the table. This partition elimination works well, but MySQL left a big optimization out of partitioning: getting data in parallel.
In fact, since partition elimination is the only major optimization provided by the partition options it isn't great for scaling access to large data sets when the entire data set must be accessed, but only when smaller parts of a the set are examined.
Since Shard-Query exploits parallelism with Gearman (http://www.gearman.org) I decided to extend the Shard-Query "optimizer" to support running queries with IN lists in parallel. This makes a query scale much further than it would if there was no parallelism at work.
Consider the table following partitioned fact table:
CREATE TABLE `fact` (
`id` bigint(20) unsigned DEFAULT NULL,
`a_id` bigint(20) unsigned DEFAULT NULL,
`b_id` int(11) NOT NULL,
`c_id` int(11) NOT NULL,
`i1` tinyint(4) DEFAULT NULL,
`qty` smallint(6) DEFAULT NULL,
`score` decimal(10,10) DEFAULT NULL,
`price` decimal(7,3) DEFAULT NULL,
`i2` int(11) DEFAULT NULL,
`i3` int(11) DEFAULT NULL,
`wide_row` char(54) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1
/*!50100 PARTITION BY HASH (i1) PARTITIONS 100 */
The table is partitioned into 100 partitions, and there are 100 distinct values for i1. This means that all the values for a particular i1 are housed in a single partition.
Consider the following query:
select price*qty from fact where i1 in (1,2,3);
This query is semantically equivalent to:
select price*qty from fact where i1 = 1
UNION ALL
select price*qty from fact where i1 = 2
UNION ALL
select price*qty from fact where i1 = 3
Unfortunately, MySQL does not have any intra-query parallelism, so rewriting the query that way is not an effective scaling strategy. However, if you execute all three queries at the same time, and use a temporary table as the UNION ALL, you can actually get parallelism. This is what Shard-Query does. It can take each IN list item and assign it to a worker, and stuff the results back together at the end.
The second thing that can be done to improve performance at the partition level (or the shard level) is to push down aggregation of distributable aggregate functions to the worker. If you've read my blog before you might know that the distributable aggregate functions are SUM and COUNT.
Consider a query very much like the previous query:
select shard_col, sum(price * qty) from t1 where shard_col in (1,2,3) group by shard_col;
This query features aggregation with distributable functions. This query is semantically equivalent to the following:
This query is semantically equivalent to:
SELCT shard_col, SUM(`sum(price*qty)`) as `sum(price*qty)`
from ( select shard_col, sum(price*qty) from t1 where shard_col = 1 group by shard_col
UNION ALL
select shard_col, sum(price*qty) from t1 where shard_col = 2 group by shard_col
UNION ALL
select shard_col, sum(price*qty) from t1 where shard_col = 3 group by shard_col
) GROUP BY shard_col;
Shard-Query can push the aggregation down to the shards and it sends each query which is part of the "UNION ALL" operation in parallel.
I ran a benchmark based on the above table with 40M rows in it. That is 400K rows per shard.
The benchmark queries follow the pattern:
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3,4) group by i1;
...
All the way up to 100 values in the IN list. The table contains 400K rows for each i1 value.
Here is that graph again. For this test I used an EC2 "c1.large" instance. That is, 8 cores with 8 GB of memory. I used a 4GB data set, Percona Server 10.2 and a 1GB buffer pool size. Each partition is approximately 32MB in size.
Since the machine has eight cores, I started eight Gearman workers. The results are, I think, impressive. In the graph, "partitions scanned" is the number of values in the IN list.

This performs very well due to the pushdown of the aggregation. If a non-distributable aggregate function were to be used, then performance would probably be worse than MySQL because all 40M rows would be accessed and copied into a temporary table.
However, it can add a lot of parallelism to queries which scan multiple partitions but return fewer rows, that is, queries with a more restrictive where clause.
This is the simple mysql benchmark script:
[root@localhost benchmarks]# cat inlist_test.php
$conn = mysql_connect() or die(mysql_error());
$fh = fopen('inlist.sql', 'r');
while($line = fgets($fh)) {
$start=microtime(true);
$stmt = mysql_query($line, $conn);
while($row = mysql_fetch_assoc($stmt)) {
}
mysql_free_result($stmt);
echo "WALLTIME::" . (microtime(true) - $start) . "s\n";
}
[root@localhost benchmarks]# head inlist.sql -n4
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3) group by i1;
select i1,sum(qty*price) from SAB_SF1.fact where i1 in(1,2,3,4) group by i1;
The Shard query times were captured with:
cat inlist.sql | ./shard_query.php --ini=parallel_query.ini --one --inlist=* --pushdown|grep WALL
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:15:"Justin Swanhart";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:48;a:6:{s:4:"data";s:73:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:33:"Tuning InnoDB Concurrency Tickets";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:43:"http://www.mysqlperformanceblog.com/?p=2728";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:81:"http://www.mysqlperformanceblog.com/2010/05/24/tuning-innodb-concurrency-tickets/";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:9474:"InnoDB has an oft-unused parameter innodb_concurrency_tickets that seems widely misunderstood. From the docs: "The number of threads that can enter InnoDB concurrently is determined by the innodb_thread_concurrency variable. A thread is placed in a queue when it tries to enter InnoDB if the number of threads has already reached the concurrency limit. When a thread is allowed to enter InnoDB, it is given a number of “free tickets” equal to the value of innodb_concurrency_tickets, and the thread can enter and leave InnoDB freely until it has used up its tickets. After that point, the thread again becomes subject to the concurrency check (and possible queuing) the next time it tries to enter InnoDB. The default value is 500..."
What this means from a practical perspective is that each query is allocated 500 tickets when it begins executing. Each time it enters InnoDB, this number is decremented until it reaches zero ("entering InnoDB" appears only to occur when a row is accessed). When it reaches zero, it may-or-may-not be put into a queue and wait to continue execution. InnoDB doesn't provide us a way in which to determine how many concurrency tickets a query uses, making this parameter notoriously difficult to tune. It is important to note that this variable only comes in to play when innodb_thread_concurrency is greater than zero.
On a stock install of MySQL, here are some example queries and the corresponding number of concurrency tickets used for each:
PLAIN TEXT
SQL:
mysql> CREATE TABLE test_table (
-> id int
-> ) ENGINE=InnoDB; -- 0 Tickets Used
Query OK, 0 rows affected (0.36 sec)
mysql> INSERT INTO test_table (id) VALUES (1); -- 0 Tickets Used
Query OK, 1 row affected (0.00 sec)
mysql> SELECT * FROM test_table; -- 1 Ticket Used
+------+
| id |
+------+
| 1 |
+------+
1 row IN SET (0.00 sec)
mysql> INSERT INTO test_table (id) VALUES (2),(3); -- 0 Tickets Used
Query OK, 2 rows affected (0.00 sec)
Records: 2 Duplicates: 0 Warnings: 0
mysql> SELECT COUNT(*) FROM test_table; -- 3 Tickets Used
+----------+
| COUNT(*) |
+----------+
| 3 |
+----------+
1 row IN SET (0.00 sec)
mysql> UPDATE test_table SET id=4 WHERE id=1; -- 4 Tickets Used (because no index, a table scan is performed)
Query OK, 1 row affected (0.00 sec)
Rows matched: 1 Changed: 1 Warnings: 0
mysql> ALTER TABLE test_table ADD INDEX (id); -- 5 Tickets Used
Query OK, 3 rows affected (0.01 sec)
Records: 3 Duplicates: 0 Warnings: 0
And now on to a more interesting scenario: foreign keys
PLAIN TEXT
SQL:
mysql> CREATE TABLE parent (id INT NOT NULL,
-> PRIMARY KEY (id)
-> ) ENGINE=INNODB; -- 0 Tickets Used
Query OK, 0 rows affected (0.01 sec)
mysql> CREATE TABLE child (id INT, parent_id INT,
-> INDEX par_ind (parent_id),
-> FOREIGN KEY (parent_id) REFERENCES parent(id)
-> ON DELETE CASCADE
-> ) ENGINE=INNODB; -- 0 Tickets Used
Query OK, 0 rows affected (0.00 sec)
mysql> INSERT INTO parent (id) VALUES (1),(2),(3),(4); -- 3 Tickets Used
Query OK, 4 rows affected (0.03 sec)
mysql> INSERT INTO child (id, parent_id) VALUES (1,1),(1,1),(2,1); -- 2 Tickets Used
Query OK, 3 rows affected (0.00 sec)
Records: 3 Duplicates: 0 Warnings: 0
mysql> DELETE FROM child WHERE 1; -- 6 Tickets Used
Query OK, 3 rows affected (0.02 sec)
mysql> ALTER TABLE `child` ADD PRIMARY KEY (`id`,`parent_id`); -- 0 Tickets Used
Query OK, 0 rows affected (0.02 sec)
Records: 0 Duplicates: 0 Warnings: 0
mysql> INSERT INTO `child` (`id`,`parent_id`) VALUES (1,1), (1,2), (2,1),(2,2); -- 3 Tickets Used
Query OK, 4 rows affected (0.01 sec)
Records: 4 Duplicates: 0 Warnings: 0
So, how can we put this into practice, since this information isn't available to most users?
INSERT w/PRIMARY KEY defined: Number of rows inserted - 1
INSERT w/FOREIGN KEY constraint: Number of rows inserted - 1
SELECT: 1 ticket per row returned
UPDATE: 1 ticket per row examined + 1 ticket per row updated
DELETE: 1 ticket per row examined + 1 ticket per row deleted
ALTER: (2 * rows in the table) - 1
As with any performance optimization effort, you will want to optimize for the common case. If you have a very simple workload, you can calculate these values by hand. But for most workloads with a complex access pattern, we'll need to estimate or wait for InnoDB to expose this information to us.
What happens in the case where I have two distinct access patterns: single row primary-key lookups and SELECT statements that examine 900 rows? If innodb_concurrency_tickets is set to 500, then all of the single row PK lookups will execute without ever being subject to an additional concurrency check (there is always one when a thread first enters InnoDB) while the 900-row SELECT statements will always be subject to one additional concurrency check (we actually care less about the concurrency check itself than the possibility that it may become queued). Your first instinct may be to increase innodb_concurrency_tickets to >=900 in this case, but that isn't necessarily the best decision. As stated in the docs, the number of threads that can enter InnoDB is limited by innodb_thread_concurrency (which is why these two variables are most often tuned in concert). To continue the example, if innodb_thread_concurrency is set to 8 and eight 900-row-SELECT statements come in, they will effectively block the PK lookups until one of them is subject to a concurrency check or complete execution and exit InnoDB. If innodb_concurrency_tickets had been increased to >= 900, then ALL of the PK lookups would be blocked until the 900-row-SELECT statements complete execution.
With a maximum value of 4,294,967,295 this has the potential to block other queries for a significant amount of time. Setting innodb_concurrency_tickets too high can have startlingly negative performance implications. On the other hand, if we determine that 99% of the traffic are these single row PK lookups and only 1% are the 900-row SELECTs, we may be tempted to lower the setting to 1 to accommodate the "typical case". The effects of this, though, would be to cause the 900-row SELECT statements to be subject to 899 concurrency checks. This means 899 potential opportunities to be queued! So, as with most other parameters, this is a balancing act.
It really comes down to the importance of the applicable queries. Imagine those 900-row SELECT statements were actually 10,000 row selects, this would become a more pressing issue. If they are reporting queries used only internally, then it is not so much of an issue and you can leave innodb_concurrency_tickets rather small. If, on the other hand, these are the queries that lead to revenue generation, you may want to give them a bit more dedicated CPU time so they execute that much faster (even at the expense of the PK lookups). In other words, if you're optimizing for throughput in this scenario, you will tune innodb_concurrency_tickets to the 99th percentile of small PK lookups. If you're optimizing for response time, you would set it larger to accommodate the larger (important) select statements.
A quick sysbench run gives us the following results (X-axis is innodb_concurrency_tickets, Y-axis is txn/sec. More is better). Since all sysbench queries are 10 rows or less, we don't really expect to see much of a difference here:
Details:
PLAIN TEXT
CODE:
sysbench --test=oltp --oltp-table-size=80000000 --oltp-read-only=off --init-rng=on --num-threads=16 --max-requests=0 --oltp-dist-type=uniform --max-time=300 --mysql-user=root --mysql-socket=/var/lib/mysql/mysql.sock run
Applicable my.cnf settings:
PLAIN TEXT
CODE:
innodb_buffer_pool_size=24G
innodb_data_file_path=ibdata1:10M:autoextend
innodb_file_per_table=1
innodb_flush_log_at_trx_commit = 1
innodb_log_buffer_size = 8M
innodb_log_files_in_group=2
innodb_log_file_size=1900M
innodb_thread_concurrency=16
innodb_flush_method = O_DIRECT
innodb_write_io_threads=8
innodb_read_io_threads=8
innodb_io_capacity=500
innodb_max_dirty_pages_pct=90
max_connections=3000
query_cache_size=0
skip-name-resolve
table_cache=10000
Entry posted by Ryan Lowe |
2 comments
Add to: | | | | ";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 03:41:20 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:7:{i:0;a:5:{s:4:"data";s:6:"Innodb";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:10:"benchmarks";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:6:"tuning";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:19:"Concurrency Tickets";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:5;a:5:{s:4:"data";s:18:"InnoDB Concurrency";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:6;a:5:{s:4:"data";s:26:"innodb_concurrency_tickets";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:17674:"InnoDB has an oft-unused parameter innodb_concurrency_tickets that seems widely misunderstood. From the docs: "The number of threads that can enter InnoDB concurrently is determined by the innodb_thread_concurrency variable. A thread is placed in a queue when it tries to enter InnoDB if the number of threads has already reached the concurrency limit. When a thread is allowed to enter InnoDB, it is given a number of “free tickets” equal to the value of innodb_concurrency_tickets, and the thread can enter and leave InnoDB freely until it has used up its tickets. After that point, the thread again becomes subject to the concurrency check (and possible queuing) the next time it tries to enter InnoDB. The default value is 500..."
What this means from a practical perspective is that each query is allocated 500 tickets when it begins executing. Each time it enters InnoDB, this number is decremented until it reaches zero ("entering InnoDB" appears only to occur when a row is accessed). When it reaches zero, it may-or-may-not be put into a queue and wait to continue execution. InnoDB doesn't provide us a way in which to determine how many concurrency tickets a query uses, making this parameter notoriously difficult to tune. It is important to note that this variable only comes in to play when innodb_thread_concurrency is greater than zero.
On a stock install of MySQL, here are some example queries and the corresponding number of concurrency tickets used for each:
SQL:
-
mysql> CREATE TABLE test_table (
-
-> id int
-
-> ) ENGINE=InnoDB; -- 0 Tickets Used
-
Query OK, 0 rows affected (0.36 sec)
-
-
mysql> INSERT INTO test_table (id) VALUES (1); -- 0 Tickets Used
-
Query OK, 1 row affected (0.00 sec)
-
-
mysql> SELECT * FROM test_table; -- 1 Ticket Used
-
+------+
-
| id |
-
+------+
-
| 1 |
-
+------+
-
1 row IN SET (0.00 sec)
-
-
mysql> INSERT INTO test_table (id) VALUES (2),(3); -- 0 Tickets Used
-
Query OK, 2 rows affected (0.00 sec)
-
Records: 2 Duplicates: 0 Warnings: 0
-
-
mysql> SELECT COUNT(*) FROM test_table; -- 3 Tickets Used
-
+----------+
-
| COUNT(*) |
-
+----------+
-
| 3 |
-
+----------+
-
1 row IN SET (0.00 sec)
-
-
mysql> UPDATE test_table SET id=4 WHERE id=1; -- 4 Tickets Used (because no index, a table scan is performed)
-
Query OK, 1 row affected (0.00 sec)
-
Rows matched: 1 Changed: 1 Warnings: 0
-
-
mysql> ALTER TABLE test_table ADD INDEX (id); -- 5 Tickets Used
-
Query OK, 3 rows affected (0.01 sec)
-
Records: 3 Duplicates: 0 Warnings: 0
And now on to a more interesting scenario: foreign keys
SQL:
-
mysql> CREATE TABLE parent (id INT NOT NULL,
-
-> PRIMARY KEY (id)
-
-> ) ENGINE=INNODB; -- 0 Tickets Used
-
Query OK, 0 rows affected (0.01 sec)
-
-
mysql> CREATE TABLE child (id INT, parent_id INT,
-
-> INDEX par_ind (parent_id),
-
-> FOREIGN KEY (parent_id) REFERENCES parent(id)
-
-> ON DELETE CASCADE
-
-> ) ENGINE=INNODB; -- 0 Tickets Used
-
Query OK, 0 rows affected (0.00 sec)
-
-
mysql> INSERT INTO parent (id) VALUES (1),(2),(3),(4); -- 3 Tickets Used
-
Query OK, 4 rows affected (0.03 sec)
-
-
mysql> INSERT INTO child (id, parent_id) VALUES (1,1),(1,1),(2,1); -- 2 Tickets Used
-
Query OK, 3 rows affected (0.00 sec)
-
Records: 3 Duplicates: 0 Warnings: 0
-
-
mysql> DELETE FROM child WHERE 1; -- 6 Tickets Used
-
Query OK, 3 rows affected (0.02 sec)
-
-
mysql> ALTER TABLE `child` ADD PRIMARY KEY (`id`,`parent_id`); -- 0 Tickets Used
-
Query OK, 0 rows affected (0.02 sec)
-
Records: 0 Duplicates: 0 Warnings: 0
-
-
mysql> INSERT INTO `child` (`id`,`parent_id`) VALUES (1,1), (1,2), (2,1),(2,2); -- 3 Tickets Used
-
Query OK, 4 rows affected (0.01 sec)
-
Records: 4 Duplicates: 0 Warnings: 0
So, how can we put this into practice, since this information isn't available to most users?
INSERT w/PRIMARY KEY defined: Number of rows inserted - 1
INSERT w/FOREIGN KEY constraint: Number of rows inserted - 1
SELECT: 1 ticket per row returned
UPDATE: 1 ticket per row examined + 1 ticket per row updated
DELETE: 1 ticket per row examined + 1 ticket per row deleted
ALTER: (2 * rows in the table) - 1
As with any performance optimization effort, you will want to optimize for the common case. If you have a very simple workload, you can calculate these values by hand. But for most workloads with a complex access pattern, we'll need to estimate or wait for InnoDB to expose this information to us.
What happens in the case where I have two distinct access patterns: single row primary-key lookups and SELECT statements that examine 900 rows? If innodb_concurrency_tickets is set to 500, then all of the single row PK lookups will execute without ever being subject to an additional concurrency check (there is always one when a thread first enters InnoDB) while the 900-row SELECT statements will always be subject to one additional concurrency check (we actually care less about the concurrency check itself than the possibility that it may become queued). Your first instinct may be to increase innodb_concurrency_tickets to >=900 in this case, but that isn't necessarily the best decision. As stated in the docs, the number of threads that can enter InnoDB is limited by innodb_thread_concurrency (which is why these two variables are most often tuned in concert). To continue the example, if innodb_thread_concurrency is set to 8 and eight 900-row-SELECT statements come in, they will effectively block the PK lookups until one of them is subject to a concurrency check or complete execution and exit InnoDB. If innodb_concurrency_tickets had been increased to >= 900, then ALL of the PK lookups would be blocked until the 900-row-SELECT statements complete execution.
With a maximum value of 4,294,967,295 this has the potential to block other queries for a significant amount of time. Setting innodb_concurrency_tickets too high can have startlingly negative performance implications. On the other hand, if we determine that 99% of the traffic are these single row PK lookups and only 1% are the 900-row SELECTs, we may be tempted to lower the setting to 1 to accommodate the "typical case". The effects of this, though, would be to cause the 900-row SELECT statements to be subject to 899 concurrency checks. This means 899 potential opportunities to be queued! So, as with most other parameters, this is a balancing act.
It really comes down to the importance of the applicable queries. Imagine those 900-row SELECT statements were actually 10,000 row selects, this would become a more pressing issue. If they are reporting queries used only internally, then it is not so much of an issue and you can leave innodb_concurrency_tickets rather small. If, on the other hand, these are the queries that lead to revenue generation, you may want to give them a bit more dedicated CPU time so they execute that much faster (even at the expense of the PK lookups). In other words, if you're optimizing for throughput in this scenario, you will tune innodb_concurrency_tickets to the 99th percentile of small PK lookups. If you're optimizing for response time, you would set it larger to accommodate the larger (important) select statements.
A quick sysbench run gives us the following results (X-axis is innodb_concurrency_tickets, Y-axis is txn/sec. More is better). Since all sysbench queries are 10 rows or less, we don't really expect to see much of a difference here:

Details:
CODE:
-
sysbench --test=oltp --oltp-table-size=80000000 --oltp-read-only=off --init-rng=on --num-threads=16 --max-requests=0 --oltp-dist-type=uniform --max-time=300 --mysql-user=root --mysql-socket=/var/lib/mysql/mysql.sock run
Applicable my.cnf settings:
CODE:
-
innodb_buffer_pool_size=24G
-
innodb_data_file_path=ibdata1:10M:autoextend
-
innodb_file_per_table=1
-
innodb_flush_log_at_trx_commit = 1
-
innodb_log_buffer_size = 8M
-
innodb_log_files_in_group=2
-
innodb_log_file_size=1900M
-
innodb_thread_concurrency=16
-
innodb_flush_method = O_DIRECT
-
innodb_write_io_threads=8
-
innodb_read_io_threads=8
-
innodb_io_capacity=500
-
innodb_max_dirty_pages_pct=90
-
max_connections=3000
-
query_cache_size=0
-
skip-name-resolve
-
table_cache=10000
Entry posted by Ryan Lowe |
2 comments
Add to:
|
|
|
| 
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:9:"Ryan Lowe";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}i:49;a:6:{s:4:"data";s:63:"
";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";s:5:"child";a:3:{s:0:"";a:6:{s:5:"title";a:1:{i:0;a:5:{s:4:"data";s:56:"MySQL and Java - Free Webinar on Using MySQL Connector/J";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"guid";a:1:{i:0;a:5:{s:4:"data";s:66:"http://blogs.sun.com/theaquarium/entry/mysql_and_java_free_webinar";s:7:"attribs";a:1:{s:0:"";a:1:{s:11:"isPermaLink";s:5:"false";}}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:4:"link";a:1:{i:0;a:5:{s:4:"data";s:66:"http://blogs.sun.com/theaquarium/entry/mysql_and_java_free_webinar";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:11:"description";a:1:{i:0;a:5:{s:4:"data";s:734:"
Mark Matthews,
Todd Farmer and Rebecca Hansen
are giving a free webinar tomorrow entitled:
Better Java Application Scalability and Reliability Using MySQL Connector/J Features.
Mark is the original creator of MySQL Connector/J and author of the book MySQL and Java Developer's Guide, Todd is the manager for the America's Support team for MySQL and Rebecca is the Product Marketing Manager for MySQL.
You only need your browser to attend;
webinar is free but registration is
required.
The event is Tue May 25th
(tomorrow)
at 9am US PT
(Other TZs),
more details
here.
MySQL.com
also has an archive of Webinars available for replay.
For webinars with
Duke
and
Sakila
check
here
- and if you are partial to
elePHPant
check
here.";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:7:"pubDate";a:1:{i:0;a:5:{s:4:"data";s:31:"Tue, 25 May 2010 03:30:12 +0000";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}s:8:"category";a:5:{i:0;a:5:{s:4:"data";s:5:"MySQL";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:1;a:5:{s:4:"data";s:4:"java";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:2;a:5:{s:4:"data";s:4:"jdbc";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:3;a:5:{s:4:"data";s:5:"mysql";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}i:4;a:5:{s:4:"data";s:7:"webinar";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:40:"http://purl.org/rss/1.0/modules/content/";a:1:{s:7:"encoded";a:1:{i:0;a:5:{s:4:"data";s:2423:"
MySQL.com
also has an archive of Webinars available for replay.
For webinars with
Duke
and
Sakila
check
here
- and if you are partial to
elePHPant
check
here.
PlanetMySQL Voting:
Vote UP /
Vote DOWN";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}s:32:"http://purl.org/dc/elements/1.1/";a:1:{s:7:"creator";a:1:{i:0;a:5:{s:4:"data";s:12:"The Aquarium";s:7:"attribs";a:0:{}s:8:"xml_base";s:0:"";s:17:"xml_base_explicit";b:0;s:8:"xml_lang";s:0:"";}}}}}}}}}}}}}}}}s:4:"type";i:128;s:7:"headers";a:7:{s:4:"date";s:29:"Wed, 02 Jun 2010 07:02:37 GMT";s:6:"server";s:22:"Apache/2.2.13 (Fedora)";s:13:"last-modified";s:29:"Wed, 02 Jun 2010 07:01:18 GMT";s:13:"accept-ranges";s:5:"bytes";s:14:"content-length";s:6:"436428";s:10:"connection";s:5:"close";s:12:"content-type";s:8:"text/xml";}s:5:"build";s:14:"20090627192103";}