Version 3 of TAX RSS

Updated 2006-11-15 16:27:26

I needed a simple, easy to work with, asyncronous safe RSS parser. I tried TclRSS [L1 ], but it had heavy requirements, and I never actually figured out the correct combination of dependencies and their versions needed to make it work. I gave up on that and started writing my own RSS parser, using the subset of the TclRSS API that I had already written a program to. I found TAX: A Tiny API for XML (make sure you use the version at the bottom of the page, I just made that into a package and called it "tax 0.1") a small, good-enough Tcl XML parser and wrote an RSS parser from example RSS 2.0 feeds, and not from the specification (I was in a hurry). It works with my limited testing (digg.com, sourceforge, and cnn), but probably doesn't work for anything that is not exactly RSS 2.0 and pretty close to one of my examples.

 #! /usr/bin/env tclsh

 package require tax

 namespace eval rss {
        namespace eval channels {
        }
        namespace eval items {
        }
 }

 proc rss::__replace_entities {text} {
        return [string map [list "&nbsp;" " " "&gt;" ">" "&lt;" "<" "&amp;" "&"] $text]
 }

 proc rss::__strip_html {text} {
        # We replace entities here (i.e., twice) because HTML-inside-XML will have
        # the HTML entities escaped twice.
        return [__replace_entities [regsub -all -- {<[^>]*>} $text ""]]
 }

 proc rss::__tax_add_to_object {obj tag isClose isSelfClosing properties body} {
        upvar #0 $obj rssobj
        set channelid [namespace tail $obj]

        set tag [string tolower [string trim $tag]]
        if {$tag == "docstart"} {
                set rssobj(parent) [list] 

                namespace eval ::rss::items::$channelid {}
        }
        if {$tag == "docstart" || $tag == "rss"} {
                return
        }
        if {[string index $tag 0] == "?"} {
                return
        }

        if {$isClose && !$isSelfClosing} {
                if {$tag == "item"} {
                        # We close tag items twice, because we add a fake open with the tag id
                        set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                }
                set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                return
        }

        lappend rssobj(parent) $tag

        set parent [lindex $rssobj(parent) end-1]
        switch -- $tag {
                "item" {
                        if {$parent == "channel"} {
                                set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info procs ::rss::items::${channelid}::*]] 0]]
                                if {$lastusedid == ""} {
                                        set lastusedid 0
                                }
                                set id "::rss::items::${channelid}::[expr $lastusedid + 1]"

                                proc $id [list command [list obj $obj] [list id $id]] {
                                        upvar #0 $obj rssobj
                                        switch -- [string tolower $command] {
                                                "title" {
                                                        set idx [list $id title]
                                                }
                                                "link" {
                                                        set idx [list $id link]
                                                }
                                                "description" {
                                                        set idx [list $id description]
                                                }
                                                "date" {
                                                        set idx [list $id pubdate]
                                                }
                                        }

                                        if {![info exists idx]} {
                                                return ""
                                        }
                                        if {![info exists rssobj($idx)]} {
                                                return ""
                                        }

                                        return $rssobj($idx)
                                }

                                lappend rssobj(items) $id

                                lappend rssobj(parent) $id
                        }
                }
                "title" {
                        set rssobj([list $parent title]) [__strip_html [__replace_entities $body]]
                }
                "link" {
                        set rssobj([list $parent link]) [__replace_entities $body]
                }
                "description" {
                        set rssobj([list $parent description]) [__strip_html [__replace_entities $body]]
                }
                "pubdate" {
                        catch {
                                set body [clock scan $body]
                        }
                        set rssobj([list $parent pubdate]) $body
                }
        }

        if {$isClose} {
                # For self-closing tags
                if {$tag == "item"} {
                        # We close tag items twice, because we add a fake open with the tag id
                        set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
                }
                set rssobj(parent) [lrange $rssobj(parent) 0 end-1]
        }

        return
 }

 # Return ID
 proc rss::parse {data} {
        set lastusedid [namespace tail [lindex [lsort -dictionary -decreasing [info vars ::rss::channels::*]] 0]]
        if {$lastusedid == ""} {
                set lastusedid 0
        }
        set id "::rss::channels::[expr $lastusedid + 1]"

        upvar #0 $id rssobj

        tax::parse [list ::rss::__tax_add_to_object $id] $data

        proc $id [list command [list obj $id]] {
                upvar #0 $obj rssobj
                switch -- $command {
                        "items" {
                                set idx items
                        }
                        "description" {
                                set idx [list channel description]
                        }
                        "link" {
                                set idx [list channel link]
                        }
                }

                if {![info exists idx]} {
                        return ""
                }
                if {![info exists rssobj($idx)]} {
                        return ""
                }
                return $rssobj($idx)
        }

        return $id
 }

 proc rss::cleanup {id} {
        if {[string match "::rss::channels::*" $id]} {
                set channelid [namespace tail $id]
                foreach proc [info procs ::rss::items::${channelid}::*] {
                        rename $proc ""
                }
                unset -nocomplain $id
        }

        return 1
 }

 package provide rss 0.1

You'll notice that it is slightly inconsisent towards the middle.. I should have used a namespace under ::rss for both channels and items, but I didn't realize this until I didn't feel like changing it. Feel free to edit the above, or use it in your own code.


LV Do you have any examples to demonstrate?


Category Internet