using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Xml;
using newtelligence.DasBlog.Runtime;
using System.Text.RegularExpressions;

/// <summary>
/// Reads Blogger Atom XML file(s), turns their entries into dasBlog
/// <c>Entry</c> / <c>Comment</c> objects and collects URL redirects from the
/// Blogger permalinks to the dasBlog ones.
/// Configure the static settings fields below, then call <see cref="Run"/>.
/// </summary>
internal static class BloggerToDasBlog
{
    /// <summary>Classification of a single Atom entry found in the input.</summary>
    private enum TypeOfEntry
    {
        Setting,
        Post,
        Comment,
        None
    }

    private const string ATOM_XML_NAMESPACE = "http://www.w3.org/2005/Atom";
    private const string PURL_XML_NAMESPACE = "http://purl.org/syndication/thread/1.0";

    // Category terms starting with this prefix carry the entry kind; all other terms are user categories.
    private const string CAT_SCHEMA = "http://schemas.google.com/blogger/";

    // Fixed character offset of the id inside a "tag:blogger.com,..." value.
    // Only used as a fallback when the ".post-" marker is missing (see ExtractBloggerId).
    private const int POSITION_OF_ID = 39;

    // Marker that precedes the id in a Blogger tag.
    // NOTE(review): assumes tags look like "tag:blogger.com,1999:blog-<blogId>.post-<id>" - confirm against real exports.
    private const string BLOGGER_ID_MARKER = ".post-";

    // Character offset of the kind name ("post", "comment", ...) inside a CAT_SCHEMA category term.
    private const int POSITION_OF_ENTRY_TYPE = 44;

    // Initial capacities for the collections built by Run (sizing hints only).
    private const int INPUT_BLOG_POSTS = 1000;
    private const int EXPECTED_BLOG_REDIRECTS = 300;
    private const int INPUT_BLOG_COMMENTS = 1000;

    // ---- Settings, expected to be assigned by the caller before Run() ----

    // Prefix put in front of the dasBlog page name in redirect targets.
    internal static string SUBFOLDER = String.Empty;
    // Blog name; used as the author of every post and to recognise the owner's own comments.
    internal static string MYBLOGNAME = String.Empty;
    // E-mail address stored on comments written by MYBLOGNAME.
    internal static string MYEMAIL = String.Empty;
    // Blog URL; its length is used to cut the host part off Blogger permalinks.
    internal static string MYBLOGURL = String.Empty;
    // Folder path for the output XML files.
    internal static string DESTINATION = String.Empty;
    // File path for the rewrite XML.
    internal static string XML_REWRITEMAP = String.Empty;
    // Folder path of the input XML file(s).
    internal static string XML_PATH = String.Empty;
    // Not referenced in the part of the file visible here.
    internal static string REGEX_post_redirect = String.Empty;
    internal static string REGEX_category_redirect = String.Empty;
    internal static string REGEX_monthlyarchive_redirect = String.Empty;

    /// <summary>
    /// Entry point: processes every <c>*.xml</c> file directly inside
    /// <see cref="XML_PATH"/>, passes each post body through
    /// <c>UpdateUrlsInBody</c>, then calls <c>WriteRewriteMaps</c> and
    /// <c>CreateDasBlogContent</c>.
    /// </summary>
    /// <remarks>
    /// The settings checks are <c>Debug.Assert</c> calls, so they are only
    /// active in builds where DEBUG is defined.
    /// </remarks>
    public static void Run()
    {
        Debug.Assert(MYBLOGNAME.Length > 0, "Must provide blog name.");
        Debug.Assert(MYBLOGURL.Length > 0, "Must provide blog URL.");
        Debug.Assert(DESTINATION.Length > 0, "Must provide folder path for output XML files.");
        Debug.Assert(XML_REWRITEMAP.Length > 0, "Must provide file path for rewrite XML.");
        Debug.Assert(XML_PATH.Length > 0, "Must provide folder path for input XML file(s).");

        List<Entry> posts = new List<Entry>(INPUT_BLOG_POSTS);
        List<Comment> comments = new List<Comment>(INPUT_BLOG_COMMENTS);
        Dictionary<string, string> redirects = new Dictionary<string, string>(EXPECTED_BLOG_REDIRECTS);

        Debug.WriteLine("Processing data from XML file(s): ");
        Debug.Indent();
        foreach (string file in Directory.GetFiles(XML_PATH, "*.xml", SearchOption.TopDirectoryOnly))
        {
            BloggerToDasBlog.ProcessXmlFile(posts, comments, redirects, file);
        }
        Debug.Unindent();

        Debug.WriteLine("\r\n Updating post content with new outgoing URLs");
        posts.ForEach(e => { e.Content = BloggerToDasBlog.UpdateUrlsInBody(e.Content, redirects); });

        Debug.WriteLine("\r\n Writing rewriteMaps elements at: " + BloggerToDasBlog.XML_REWRITEMAP);
        BloggerToDasBlog.WriteRewriteMaps(redirects);

        Debug.WriteLine("\r\n Creating dasBlog content at: " + BloggerToDasBlog.DESTINATION);
        BloggerToDasBlog.CreateDasBlogContent(posts, comments);

        Debug.WriteLine("Done");
    }

    #region ProcessXmlFile

    /// <summary>
    /// Loads one XML file and appends the posts, comments and redirects found
    /// in its Atom <c>entry</c> elements to the supplied collections.
    /// </summary>
    /// <param name="posts">Receives one <c>Entry</c> per post.</param>
    /// <param name="comments">Receives one <c>Comment</c> per comment that has an <c>in-reply-to</c> element.</param>
    /// <param name="redirects">Receives old-URL to new-URL pairs for posts whose URL slug changes.</param>
    /// <param name="file">Path of the XML file to load.</param>
    /// <exception cref="XmlException">The file is not well-formed XML (rethrown after logging its location).</exception>
    private static void ProcessXmlFile(List<Entry> posts, List<Comment> comments, Dictionary<string, string> redirects, string file)
    {
        Debug.WriteLine(file);

        XmlDocument doc = new XmlDocument();
        try
        {
            doc.Load(file);
        }
        catch (XmlException e)
        {
            Debug.WriteLine(string.Format("XML is invalid in file {0} at location {1}, {2} ", file, e.LineNumber, e.LinePosition));
            Debug.WriteLine("Go back to blogger.com and fix the blog post the XML refers to. Most likley a mismatched tag that your browser was too forgiving with.");
            throw;
        }

        XmlNamespaceManager namespaceMgr = new XmlNamespaceManager(doc.NameTable);
        namespaceMgr.AddNamespace("def", ATOM_XML_NAMESPACE);
        namespaceMgr.AddNamespace("thr", PURL_XML_NAMESPACE);

        XmlNodeList entries = doc.SelectNodes(@"//def:entry", namespaceMgr);
        foreach (XmlElement b in entries)
        {
            TypeOfEntry toe;
            string cats = BloggerToDasBlog.GetCategoriesAndTypeOfEntry(b, out toe, namespaceMgr, redirects);

            if (toe == TypeOfEntry.Post)
            {
                Entry e = BloggerToDasBlog.GetPostFromXml(b, cats);
                // Order matters: the title may be changed here, and the redirect logic below reads the resulting CompressedTitle.
                BloggerToDasBlog.EliminateDuplicatePostTitles(posts, e);
                BloggerToDasBlog.DealWithDifferentTitleGenerationInBlogEngines(redirects, b, e, namespaceMgr);
                posts.Add(e);
            }
            else if (toe == TypeOfEntry.Comment)
            {
                XmlNodeList thrInReplyTo = b.GetElementsByTagName("in-reply-to", PURL_XML_NAMESPACE);
                if (thrInReplyTo.Count == 0)
                {
                    Debug.WriteLine("skip comment, it is not attached to anything: " + InnerTextOrEmpty(b["title"]));
                    continue;
                }

                string targetEntryId = ExtractBloggerId(thrInReplyTo[0].Attributes["ref"].Value);
                Comment c = BloggerToDasBlog.GetCommentFromXml(b, targetEntryId);
                comments.Add(c);
            }
            else
            {
                // Anything that is neither a post nor a comment is expected to be a settings/template entry and is ignored.
                Debug.Assert(toe == TypeOfEntry.Setting, InnerTextOrEmpty(b["id"]));
                continue;
            }
        }
    }

    /// <summary>
    /// Determines what kind of entry <paramref name="b"/> is and collects its
    /// user categories.
    /// </summary>
    /// <param name="b">The Atom <c>entry</c> element.</param>
    /// <param name="entryType">Receives the detected kind of the entry.</param>
    /// <param name="namespaceMgr">Namespace manager that maps the <c>def</c> prefix to the Atom namespace.</param>
    /// <param name="redirects">Not used by this method; kept so the signature stays unchanged.</param>
    /// <returns>
    /// Every category term that does not start with <see cref="CAT_SCHEMA"/>,
    /// each followed by <c>;</c> (empty string when there are none).
    /// </returns>
    private static string GetCategoriesAndTypeOfEntry(XmlElement b, out TypeOfEntry entryType, XmlNamespaceManager namespaceMgr, Dictionary<string, string> redirects)
    {
        entryType = TypeOfEntry.None;
        string result = string.Empty;

        XmlNodeList terms = b.SelectNodes(@"def:category/@term", namespaceMgr);
        foreach (XmlAttribute item in terms)
        {
            string catTerm = item.InnerText;
            if (!catTerm.StartsWith(CAT_SCHEMA, StringComparison.Ordinal))
            {
                result += catTerm + ";";
                continue;
            }

            // A term too short to contain a kind name falls through to the default branch instead of throwing.
            string kind = catTerm.Length > POSITION_OF_ENTRY_TYPE
                ? catTerm.Substring(POSITION_OF_ENTRY_TYPE)
                : string.Empty;

            switch (kind)
            {
                case "comment":
                    entryType = TypeOfEntry.Comment;
                    break;
                case "post":
                    entryType = TypeOfEntry.Post;
                    break;
                case "settings":
                case "template":
                    entryType = TypeOfEntry.Setting;
                    break;
                default:
                    Debug.Assert(false, catTerm);
                    entryType = TypeOfEntry.Setting;
                    break;
            }
        }

        Debug.Assert(entryType != TypeOfEntry.Post || result.Length > 0, InnerTextOrEmpty(b["title"]));

        // If the entry type is None, that means we are dealing with multiple XML files instead of just one.
        // This code copes with both types of XML files
        if (entryType == TypeOfEntry.None)
        {
            XmlNodeList links = b.SelectNodes(@"def:link/@href", namespaceMgr);
            Debug.Assert(links.Count >= 3, InnerTextOrEmpty(b["title"]));

            bool hasLinks = links.Count > 0;

            // An entry that carries user categories is treated as a post. Previously that
            // decision was made and then unconditionally overwritten by the link test.
            if (result.Length > 0 || links.Count > 3 || (hasLinks && links[0].Value.Contains("posts")))
            {
                entryType = TypeOfEntry.Post;
            }
            else
            {
                Debug.Assert(hasLinks && links[0].Value.Contains("comments"), hasLinks ? links[0].Value : string.Empty);
                entryType = TypeOfEntry.Comment;
            }
        }

        return result;
    }

    /// <summary>
    /// Builds a dasBlog <c>Entry</c> from a post element. The entry is public,
    /// syndicated, shown on the front page, allows comments and is authored by
    /// <see cref="MYBLOGNAME"/>.
    /// </summary>
    /// <param name="b">The Atom <c>entry</c> element of the post.</param>
    /// <param name="cats">Category string to store on the entry.</param>
    /// <returns>The populated entry.</returns>
    private static Entry GetPostFromXml(XmlElement b, string cats)
    {
        Entry e = new Entry();
        e.Categories = cats;
        e.Title = b["title"].InnerText;
        e.Author = MYBLOGNAME;
        e.IsPublic = true;
        e.ShowOnFrontPage = true;
        e.AllowComments = true;
        e.Syndicated = true;
        e.EntryId = BloggerToDasBlog.GetEntryId(b);
        e.Content = BloggerToDasBlog.GetContent(b);
        e.CreatedLocalTime = BloggerToDasBlog.GetCreatedDate(b);
        e.ModifiedLocalTime = BloggerToDasBlog.GetModifiedDate(b);
        return e;
    }

    /// <summary>
    /// Prefixes the title of <paramref name="e"/> with "[again] " when an
    /// already collected post has the same <c>CompressedTitle</c>.
    /// </summary>
    /// <param name="posts">Posts collected so far.</param>
    /// <param name="e">The post about to be added.</param>
    private static void EliminateDuplicatePostTitles(List<Entry> posts, Entry e)
    {
        if (posts.Exists((unique => unique.CompressedTitle == e.CompressedTitle)))
        {
            Debug.WriteLine("Found duplicate: " + e.Title);
            e.Title = "[again] " + e.Title; // trying to fix duplicate (does not deal with triplicates)
        }
    }

    /// <summary>
    /// Registers a redirect when the slug in the Blogger permalink differs
    /// (ignoring case) from the entry's <c>CompressedTitle</c> with '+'
    /// replaced by '-'.
    /// </summary>
    /// <param name="redirects">Receives the old-URL to new-URL pair.</param>
    /// <param name="b">The Atom <c>entry</c> element of the post.</param>
    /// <param name="e">The entry built from <paramref name="b"/>.</param>
    /// <param name="namespaceMgr">Namespace manager that maps the <c>def</c> prefix to the Atom namespace.</param>
    /// <exception cref="ArgumentException">The same Blogger URL is already present in <paramref name="redirects"/>.</exception>
    private static void DealWithDifferentTitleGenerationInBlogEngines(Dictionary<string, string> redirects, XmlElement b, Entry e, XmlNamespaceManager namespaceMgr)
    {
        XmlNode alternate = b.SelectSingleNode(@"def:link[@rel='alternate']/@href", namespaceMgr);
        if (alternate == null)
        {
            // Without an alternate link there is no old URL to redirect from.
            // NOTE(review): presumably this happens for unpublished drafts - confirm.
            Debug.WriteLine("No alternate link found, no redirect created for: " + e.Title);
            return;
        }

        string bloggerPerma = alternate.InnerText;

        // Slug = text between the last '/' and the extension. A '.' located before the
        // last '/' belongs to the host or folder part and is not treated as an extension.
        int nameStart = bloggerPerma.LastIndexOf('/') + 1;
        int extensionStart = bloggerPerma.LastIndexOf('.');
        string titleOnly = extensionStart >= nameStart
            ? bloggerPerma.Substring(nameStart, extensionStart - nameStart)
            : bloggerPerma.Substring(nameStart);

        string dasBlogTitle = e.CompressedTitle.Replace('+', '-');

        // URL slugs are identifiers, so compare them ordinally rather than with the current culture.
        if (String.Compare(titleOnly, dasBlogTitle, StringComparison.OrdinalIgnoreCase) != 0)
        {
            redirects.Add(
                bloggerPerma.Substring(BloggerToDasBlog.MYBLOGURL.Length - BloggerToDasBlog.SUBFOLDER.Length),
                BloggerToDasBlog.SUBFOLDER + dasBlogTitle + ".aspx");
            Debug.WriteLine(string.Format("Will redirect: \r\n \t {0} \r\n \t {1}", titleOnly, dasBlogTitle.ToLowerInvariant()));
        }
    }

    /// <summary>Returns the id part of the entry's <c>id</c> element.</summary>
    /// <param name="b">The Atom <c>entry</c> element.</param>
    private static string GetEntryId(XmlElement b)
    {
        string ret = b["id"].InnerText;
        Debug.Assert(ret.StartsWith(@"tag:blogger.com,", StringComparison.Ordinal), ret);
        return ExtractBloggerId(ret);
    }

    /// <summary>
    /// Returns the text after the last ".post-" marker of a Blogger tag. When
    /// the marker is absent, falls back to cutting at the fixed offset
    /// <see cref="POSITION_OF_ID"/>, which is what the code did before.
    /// </summary>
    /// <param name="bloggerTag">Value of an <c>id</c> element or of an <c>in-reply-to</c> <c>ref</c> attribute.</param>
    private static string ExtractBloggerId(string bloggerTag)
    {
        int marker = bloggerTag.LastIndexOf(BLOGGER_ID_MARKER, StringComparison.Ordinal);
        if (marker >= 0)
        {
            return bloggerTag.Substring(marker + BLOGGER_ID_MARKER.Length);
        }

        return bloggerTag.Substring(POSITION_OF_ID);
    }

    /// <summary>
    /// Returns the node's inner text, or an empty string when the node is
    /// null. Used for assert and log messages, which must not throw.
    /// </summary>
    private static string InnerTextOrEmpty(XmlNode node)
    {
        return node == null ? string.Empty : node.InnerText;
    }

    /// <summary>Parses the entry's <c>updated</c> element as a date/time.</summary>
    private static DateTime GetModifiedDate(XmlElement b)
    {
        // Feed timestamps are machine-readable, so parse them independently of the machine's culture.
        return DateTime.Parse(b["updated"].InnerText, System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>Parses the entry's <c>published</c> element as a date/time.</summary>
    private static DateTime GetCreatedDate(XmlElement b)
    {
        return DateTime.Parse(b["published"].InnerText, System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Builds a dasBlog <c>Comment</c> (public, marked as not spam) from a
    /// comment element.
    /// </summary>
    /// <param name="b">The Atom <c>entry</c> element of the comment.</param>
    /// <param name="teid">Id of the post the comment belongs to.</param>
    /// <returns>The populated comment.</returns>
    private static Comment GetCommentFromXml(XmlElement b, string teid)
    {
        Comment c = new Comment();
        c.TargetEntryId = teid;
        c.IsPublic = true;
        c.SpamState = SpamState.NotSpam;
        c.EntryId = BloggerToDasBlog.GetEntryId(b);
        c.Content = BloggerToDasBlog.GetContent(b);
        c.CreatedLocalTime = BloggerToDasBlog.GetCreatedDate(b);
        c.ModifiedLocalTime = BloggerToDasBlog.GetModifiedDate(b);
        BloggerToDasBlog.PopulateAuthor(b, c);
        return c;
    }

    /// <summary>
    /// Copies name, homepage and e-mail from the entry's first <c>author</c>
    /// element to the comment. Comments written by <see cref="MYBLOGNAME"/>
    /// get <see cref="MYEMAIL"/> and <see cref="MYBLOGURL"/> instead, and the
    /// address "noreply@blogger.com" is never copied.
    /// </summary>
    /// <param name="b">The Atom <c>entry</c> element of the comment.</param>
    /// <param name="c">The comment to populate.</param>
    private static void PopulateAuthor(XmlElement b, Comment c)
    {
        XmlElement author = (XmlElement)b.GetElementsByTagName("author")[0];
        if (author == null)
        {
            // No author element at all: leave the author fields untouched instead of failing.
            Debug.WriteLine("comment has no author element: " + InnerTextOrEmpty(b["id"]));
            return;
        }

        XmlElement temp = author["name"];

        // The name element is optional (see the null check below), so the assert message must be null-safe.
        Debug.Assert(author.ChildNodes.Count <= 4, InnerTextOrEmpty(temp));

        if (temp != null)
        {
            c.Author = temp.InnerText;
        }

        if (c.Author == MYBLOGNAME)
        {
            c.AuthorEmail = MYEMAIL;
            c.AuthorHomepage = MYBLOGURL;
            return;
        }

        temp = author["uri"];
        if (temp != null)
        {
            c.AuthorHomepage = temp.InnerText;
        }

        temp = author["email"];
        if (temp != null && temp.InnerText != "noreply@blogger.com")
        {
            c.AuthorEmail = temp.InnerText;
        }
    }

    private static string GetContent(XmlElement b)
    {
        string result = b["content"].InnerText;
        int trimEnd = result.IndexOf("