Skip to content

Semgrex DSL

Simon Gray edited this page Oct 24, 2017 · 1 revision

Experimental Clojure DSL for Semgrex

;; identifiers for nodes are marked in this way
{:text "xyz", :pos "NN", :id 3}



;; alternative syntax
[{} :< :nsubj {}]
[{} :<nsubj {}]
[{} :< {}]
[:< :nsubj {} {}]
(< (! {}) {})

;; identifiers for relations are set in this way (:? is the identifier)
(se [x {:lemma "eat"}
     y :?]
    (< y {} x))





;; naming of relations and nodes is done inside a vector preceding the pattern definition
;; as a side effect, the pattern definitions are also simplified
;; so it might even make sense in cases where the named nodes need to be accessed
(se-pattern [x-rel :unknown
             see-node {:lemma "see"}
             eat-node {:lemma "eat"}]
            (> x-rel {} see-node eat-node))
;; {} >=x-rel {lemma:see}=see-node >=x-rel {lemma:eat}=eat-node
;; hmm, maybe not good, as x-rel appears twice when using (> ...)


(se-pattern [eat {:lemma "eat"}
             rel #"nsubj|dobj"
             x {}]
            "{eat} >rel {x}")


;; base relation function
(relation :dep :nsubj {} {} {})

;; the subject of the phrase (dep of nsubj rel)
(< :nsubj {} {} {})

;; the root of the phrase (gov of nsubj rel)
(> :nsubj {} {} {})
(> :nsubj {} {} {})

;; any dep relation
(< {} {} {})

;; number of nodes can vary (must have at least 2)
(> {} {})
(< {} {} {} {} {} {})

;; nodes can be strings (this allows the output of other functions to be used as nodes)
(< "!{lemma:have}" {})
(< (not {:lemma "have"}) {})

;; also works with nesting
(< (not (or {:lemma "have"}
            {:lemma "eat"}))
   {})
(< "!({lemma:have}|{lemma:eat})" {})

;; use vectors to group
(> [{:lemma "thing"} {:lemma "thingy"}] {})

;; optional relations
(?> {} {})
(?< {} {})
Clone this wiki locally