# RELAX NG compact syntax schema for text that has been augmented by
# linguistic preprocessing.

# Namespace of every element defined by this schema.
namespace aug = "http://www.purl.org/net/ties/schema/augment"

# Entry point. A document is either the aug:augment wrapper itself or an
# element from an outer namespace that embeds augmented text.
start = AugmentedText | OtherOutsideContent

# The wrapper element; it holds any number of top-level items.
AugmentedText = element aug:augment { TopLevelContent* }

# ---------------------------------------------------------------------------
# Content models
# ---------------------------------------------------------------------------

# Outermost level: anything goes, so wrapping material in a sentence
# element is optional.
TopLevelContent = Sentence | InlineContent | OtherContent

# Inline level: everything except sentences. A sentence may hold POS
# elements directly (e.g. punctuation), and constituents may nest inside
# other constituents.
InlineContent = Constituent | POS | OtherContent

# ---------------------------------------------------------------------------
# Elements of the aug namespace (in alphabetic order)
# ---------------------------------------------------------------------------

# aug:const -- a sentence constituent.
Constituent =
  element aug:const {
    TypeAttribute,    # required
    InlineContent+    # one or more inline children
  }

# aug:pos -- a part-of-speech item (a word or other token).
POS =
  element aug:pos {
    TypeAttribute,              # required
    NormalAttribute?,           # optional
    SegmentationAttributes?,    # optional
    text                        # the word or token itself
  }

# aug:sent -- a sentence; needs one or more inline children.
Sentence = element aug:sent { InlineContent+ }

# ---------------------------------------------------------------------------
# Elements from other namespaces
# ---------------------------------------------------------------------------

# Elements from any other namespace are accepted as long as they embed the
# elements of this schema in the appropriate way, which lets augmented text
# live inside arbitrary XML documents. POS is the exception: it holds only
# a text token and therefore no foreign elements.

# Outside of augmented text: mixed content, further outside elements, the
# aug:augment wrapper, and top-level items are all permitted (so the wrapper
# is optional when augmented text is embedded).
OtherOutsideContent =
  element * - aug:* {
    AnyAttributes,
    ( text | OtherOutsideContent | AugmentedText | TopLevelContent )*
  }

# Foreign elements nested inside augmented text. Sentences and constituents
# cannot directly contain other sentences, but an embedded foreign element
# (e.g. a footnote) can, because its content model is TopLevelContent.
OtherContent = element * - aug:* { AnyAttributes, TopLevelContent* }

# Foreign elements may carry any number of attributes with any name.
AnyAttributes = ( attribute * { text } )*

# ---------------------------------------------------------------------------
# Attributes (in alphabetic order)
# ---------------------------------------------------------------------------

# The normalized form of an element (when different from the textual
# content). Can contain pipe-separated alternatives, e.g.:
#   <pos type="PRF" normal="er|es|sie|Sie">sich</pos>
NormalAttribute = attribute normal { text }

# Compound segmentation (relevant for German texts). Example:
#   <pos type="NE" normal="Rettungsroboter"
#        segments="rettung s roboter" normalSegments="rettung roboter"
#        baseSegment="roboter">Rettungsroboter</pos>
# Normalized forms can contain pipe-separated alternatives, e.g.:
#   <pos type="NN" segments="wettbewerbs aufgaben"
#        normalSegments="wettbewerb aufgeb|aufgabe"
#        baseSegment="aufgeb|aufgabe">Wettbewerbsaufgaben</pos>
#
# The three attributes form one group: where the group is used, all three
# are expected together. The group is written with parentheses because the
# compact syntax has no brace-only grouping construct.
#
# List items are declared as "token" rather than "text": RELAX NG does not
# allow a text pattern inside list. Each whitespace-separated item is still
# an arbitrary string, so pipe-separated alternatives remain valid.
SegmentationAttributes =
  (
    # Whitespace-separated list of segments.
    attribute segments { list { token+ } },
    # Whitespace-separated list of the normalized form of segments
    # (when known).
    attribute normalSegments { list { token+ } },
    # The normalized form of the main segment.
    attribute baseSegment { text }
  )

# The type of an element. The value "other" indicates an element that
# could not be classified (so no mixed content is required).
TypeAttribute = attribute type { xsd:NMTOKEN }

# Not part of the schema (preprocessor/language-dependent): enumerations
# of attribute values, e.g. attribute type { "nc" | "vc" | "pc" }.