Skip to main content

Filtering by Number of Sentences

Filtering by Number of Sentences

NLP divides a source text into sentences. The following example filters out sources that contain fewer than 75 sentences. It first returns the total number of sources in the domain, then uses the filter to return the number of sources containing 75 or more sentences, and finally uses the filter again to list each of these sources together with its sentence count:

DomainCreateOrOpen
  // Open the domain named "mydomain" if it already exists; otherwise create
  // and save it. Leaves the domain object reference in domoref.
  // An existing domain continues at DeleteOldData (its old data is dropped);
  // a newly created domain continues directly at SetEnvironment.
  SET dname="mydomain"
  IF ##class(%iKnow.Domain).NameIndexExists(dname) {
      SET domoref=##class(%iKnow.Domain).NameIndexOpen(dname)
      GOTO DeleteOldData
  }
  // The domain does not exist yet: create it and persist it.
  SET domoref=##class(%iKnow.Domain).%New(dname)
  // Check the save status instead of discarding it, so a failed save stops
  // the example here rather than failing later with a less obvious error.
  SET stat=domoref.%Save()
  IF $System.Status.IsError(stat) {
      WRITE "Domain save error ",$System.Status.GetErrorText(stat)
      QUIT
  }
  GOTO SetEnvironment
DeleteOldData
  // Remove all data previously loaded into the existing domain so the
  // example starts from an empty domain. Quits the routine on failure.
  SET stat=domoref.DropData()
  IF $System.Status.IsError(stat) {
      // GetErrorText() returns the message as a string, which is what WRITE
      // needs. DisplayError() prints the message itself and returns a status,
      // so using it as a WRITE argument emits a stray status value.
      WRITE "DropData error ",$System.Status.GetErrorText(stat)
      QUIT
  }
  GOTO SetEnvironment
SetEnvironment
  // Record the domain ID and make sure a configuration named "myconfig"
  // exists, opening it if present and creating it otherwise.
  SET domId=domoref.Id
  IF ##class(%iKnow.Configuration).Exists("myconfig") {
      SET cfg=##class(%iKnow.Configuration).Open("myconfig")
  }
  ELSE {
      // NOTE(review): the %New() arguments after the name (0, the language
      // list $LISTBUILD("en"), "", 1) are passed through unchanged - confirm
      // their meaning against the %iKnow.Configuration class reference.
      SET cfg=##class(%iKnow.Configuration).%New("myconfig",0,$LISTBUILD("en"),"",1)
      // Check the save status instead of discarding it.
      SET stat=cfg.%Save()
      IF $System.Status.IsError(stat) {
          WRITE "Configuration save error ",$System.Status.GetErrorText(stat)
          QUIT
      }
  }
ListerAndLoader
  // Create the two objects used to bring sources into the domain:
  //   myloader - a %iKnow.Source.Loader for this domain
  //   flister  - a %iKnow.Source.SQL.Lister for this domain
  SET domId=domoref.Id
  SET myloader=##class(%iKnow.Source.Loader).%New(domId)
  SET flister=##class(%iKnow.Source.SQL.Lister).%New(domId)
QueryBuild
  // Describe the SQL result set that supplies the sources:
  //   myquery  - the query text
  //   idfld    - result column passed to the lister as the ID field
  //   grpfld   - result column passed to the lister as the group field
  //   dataflds - list of result columns passed to the lister as data fields
  // NOTE(review): the query uses TOP 50 without ORDER BY, so which 50 rows
  // are selected is not guaranteed to be the same on every run.
  SET dataflds=$LISTBUILD("NarrativeFull")
  SET grpfld="Type"
  SET idfld="UniqueVal"
  SET myquery="SELECT TOP 50 ID AS UniqueVal,Type,NarrativeFull FROM Aviation.Event"
UseListerAndLoader
  // Queue the query results with the lister, then have the loader process
  // the batch. Either step quits the routine with a message on failure.
  SET stat=flister.AddListToBatch(myquery,idfld,grpfld,dataflds)
  IF $System.Status.IsError(stat) {
      // Test the status through the status API rather than comparing to 1,
      // and print the message with GetErrorText(): DisplayError() writes the
      // message itself and returns a status, which WRITE would then print.
      WRITE "The lister failed: ",$System.Status.GetErrorText(stat)
      QUIT
  }
  SET stat=myloader.ProcessBatch()
  IF $System.Status.IsError(stat) {
      WRITE "The loader failed: ",$System.Status.GetErrorText(stat)
      QUIT
  }
DefineAFilter
  // Build a sentence-count filter for the domain and set its minimum to
  // nsent sentences. nsent is reused below when reporting the results.
  SET nsent=75
  SET filt=##class(%iKnow.Filters.SentenceCountFilter).%New(domId)
  DO filt.MinSentenceCountSet(nsent)
SourceSentenceQueries
  // Report the total number of sources in the domain, the number that pass
  // the sentence-count filter, and the sentence count of each passing source.
  SET numSrcD=##class(%iKnow.Queries.SourceAPI).GetCountByDomain(domId)
  WRITE "The domain contains ",numSrcD," sources",!
  SET numSrcFD=##class(%iKnow.Queries.SourceAPI).GetCountByDomain(domId,filt)
  WRITE "Of these ",numSrcD," sources ",numSrcFD," contain ",nsent," or more sentences:",!
  // Fetch page 1 with a page size of 50 (matching the TOP 50 load query)
  // of the sources that pass the filter.
  DO ##class(%iKnow.Queries.SourceAPI).GetByDomain(.result,domId,1,50,filt)
  SET i=1
  WHILE $DATA(result(i)) {
     // Each result row is a list: element 1 is read as the internal source
     // ID and element 2 as the external ID.
     SET intId = $LISTGET(result(i),1)
     SET extId = $LISTGET(result(i),2)
     // Pass only the internal source ID as the source-ID list. The original
     // passed the whole row, which also put the external ID into that list.
     // NOTE(review): assumes GetCountBySource takes a list of internal
     // source IDs - confirm against the SentenceAPI class reference.
     SET numSentS = ##class(%iKnow.Queries.SentenceAPI).GetCountBySource(domId,$LISTBUILD(intId))
     WRITE "source ",intId," ",extId
     WRITE " has ",numSentS," sentences",!
     SET i=i+1
  }
  // i is one past the last row that was found, so i-1 rows were listed.
  WRITE i-1," sources listed"

To filter using both minimum and maximum number of sentences, invoke both instance methods. The following filter selects those sources containing between 10 and 25 sentences (inclusive):

MinMaxFilter
  // Build a sentence-count filter with both bounds set: a minimum of 10
  // and a maximum of 25 (min+15) sentences.
  SET filt=##class(%iKnow.Filters.SentenceCountFilter).%New(domId)
  SET min=10
  SET max=min+15
  DO filt.MinSentenceCountSet(min)
  DO filt.MaxSentenceCountSet(max)
Feedback (opens in a new tab)