NLP performs automatic language identification on a per-sentence basis. When the current configuration has activated automatic language identification, NLP tests each sentence in each source text to determine which of the languages specified in the Configuration is the language used in that sentence. This identification is a statistical probability. This has the following consequences:
NLP subsequently uses this language determination in determining CRCs and other NLP analysis.
Thus, source texts and sentences within a source text can be in different languages. NLP automatically determines which language model to apply. Automatic language identification also assigns a confidence level in its language identification as an integer indicating a percentage. These range from 100 (complete confidence) to 0 (indeterminate). If automatic language identification is not active, all sentences are assigned a confidence level of 0.
Language Identification Queries
Refer to A Note on Program Examples for details on the coding and data used in the examples in this book.
The following example uses GetTopLanguage() to identify the language for a source and the degree of confidence in that identification. Because language identification is performed on the sentence level, the language for the source is the result of averaging the language identification confidence for the component sentences. This method returns the language as a two character abbreviation (in this case, “en”). Note that totlangconf (the total of the language confidence for the sentences) must be divided by numlangsent, not by numsent. These two sentence count numbers are usually, but not always, the same. This is because a source may contain sentences for which no language can be determined.
Configuration
  // Obtain the "EnFr" configuration: create and save it when it is not yet
  // defined, otherwise open the existing one.
  // NOTE(review): the second %New() argument (1) presumably activates automatic
  // language identification for the listed languages - confirm against the
  // %iKnow.Configuration class reference.
  SET myconfig="EnFr"
  IF '##class(%iKnow.Configuration).Exists(myconfig) {
    SET cfg=##class(%iKnow.Configuration).%New(myconfig,1,$LISTBUILD("en","fr"),"",1)
    DO cfg.%Save()
  }
  ELSE {
    SET cfg=##class(%iKnow.Configuration).Open(myconfig)
  }
  // Record the configuration's Id
  SET cfgId=cfg.Id
DomainCreateOrOpen
  // Obtain an object reference (domoref) for the "mydomain" domain.
  // A newly created domain continues at ListerAndLoader; an existing domain
  // continues at DeleteOldData.
  SET dname="mydomain"
  IF '##class(%iKnow.Domain).NameIndexExists(dname) {
    WRITE "The ",dname," domain does not exist",!
    SET domoref=##class(%iKnow.Domain).%New(dname)
    DO domoref.%Save()
    WRITE "Created the ",dname," domain with domain ID ",domoref.Id,!
    GOTO ListerAndLoader
  }
  // Reached only when the domain already exists
  WRITE "The ",dname," domain already exists",!
  SET domoref=##class(%iKnow.Domain).NameIndexOpen(dname)
  GOTO DeleteOldData
DeleteOldData
  // Drop the data held by the existing domain; stop the routine if that fails.
  SET stat=domoref.DropData()
  IF 'stat {
    WRITE "DropData error ",$System.Status.DisplayError(stat)
    QUIT
  }
  WRITE "Deleted the data from the ",dname," domain",!!
  GOTO ListerAndLoader
ListerAndLoader
  // Create the SQL lister for the domain and attach the configuration to it;
  // the loader is created only after SetConfig has succeeded.
  SET domId=domoref.Id
  SET flister=##class(%iKnow.Source.SQL.Lister).%New(domId)
  SET stat=flister.SetConfig(myconfig)
  IF stat '= 1 {
    WRITE "SetConfig error ",$System.Status.DisplayError(stat)
    QUIT
  }
  SET myloader=##class(%iKnow.Source.Loader).%New(domId)
QueryBuild
  // Define the query text and the field names that are later handed to
  // AddListToBatch(): ID field, group field, and the list of data fields.
  SET myquery="SELECT Top 10 ID AS UniqueVal,Type,NarrativeFull FROM Aviation.Event"
  SET idfld="UniqueVal",grpfld="Type"
  SET dataflds=$LISTBUILD("NarrativeFull")
UseLister
  // Hand the query to the lister; stop the routine if it is rejected.
  SET stat=flister.AddListToBatch(myquery,idfld,grpfld,dataflds)
  IF stat '= 1 {
    WRITE "The lister failed: ",$System.Status.DisplayError(stat)
    QUIT
  }
UseLoader
  // Process the batch with the loader; stop the routine on failure.
  SET stat=myloader.ProcessBatch()
  IF stat '= 1 {
    WRITE "The loader failed: ",$System.Status.DisplayError(stat)
    QUIT
  }
GetSources
  // For every source returned by GetByDomain(), report its sentence count,
  // its top language, and the average language-identification confidence.
  DO ##class(%iKnow.Queries.SourceAPI).GetByDomain(.result,domId)
  SET i=1
  WHILE $DATA(result(i)) {
    // Each result row is a list: element 1 = internal source ID, element 2 = external ID
    SET intId = $LISTGET(result(i),1)
    SET extId = $LISTGET(result(i),2)
    SET numsent = ##class(%iKnow.Queries.SentenceAPI).GetCountBySource(domId,result(i))
    WRITE !,extId," has ",numsent," sentences",!
    // totlangconf and numlangsent are filled in by GetTopLanguage()
    SET srclang = ##class(%iKnow.Queries.SourceAPI).GetTopLanguage(domId,intId,.totlangconf,.numlangsent)
    // Guard the average: when no sentence of the source has a determinable
    // language, numlangsent is 0 and the division would raise a <DIVIDE> error.
    IF +$GET(numlangsent) > 0 {
      WRITE "Source language is ",srclang,!,"with a confidence % of ",totlangconf/numlangsent,!!
    }
    ELSE {
      WRITE "Source language could not be determined",!!
    }
    SET i=i+1
  }
The following example uses GetLanguage() to identify the language for each sentence in a source and the degree of confidence in that identification. This method returns the language as a two character abbreviation (in this case, “en”) and the confidence level as a percentage between 0 and 100. Note that the confidence level is rarely (if ever) 100%.
Configuration
  // Obtain the "EnFr" configuration: create and save it when it is not yet
  // defined, otherwise open the existing one.
  // NOTE(review): the second %New() argument (1) presumably activates automatic
  // language identification for the listed languages - confirm against the
  // %iKnow.Configuration class reference.
  SET myconfig="EnFr"
  IF '##class(%iKnow.Configuration).Exists(myconfig) {
    SET cfg=##class(%iKnow.Configuration).%New(myconfig,1,$LISTBUILD("en","fr"),"",1)
    DO cfg.%Save()
  }
  ELSE {
    SET cfg=##class(%iKnow.Configuration).Open(myconfig)
  }
  // Record the configuration's Id
  SET cfgId=cfg.Id
DomainCreateOrOpen
  // Obtain an object reference (domoref) for the "mydomain" domain.
  // A newly created domain continues at ListerAndLoader; an existing domain
  // continues at DeleteOldData.
  SET dname="mydomain"
  IF '##class(%iKnow.Domain).NameIndexExists(dname) {
    WRITE "The ",dname," domain does not exist",!
    SET domoref=##class(%iKnow.Domain).%New(dname)
    DO domoref.%Save()
    WRITE "Created the ",dname," domain with domain ID ",domoref.Id,!
    GOTO ListerAndLoader
  }
  // Reached only when the domain already exists
  WRITE "The ",dname," domain already exists",!
  SET domoref=##class(%iKnow.Domain).NameIndexOpen(dname)
  GOTO DeleteOldData
DeleteOldData
  // Drop the data held by the existing domain; stop the routine if that fails.
  SET stat=domoref.DropData()
  IF 'stat {
    WRITE "DropData error ",$System.Status.DisplayError(stat)
    QUIT
  }
  WRITE "Deleted the data from the ",dname," domain",!!
  GOTO ListerAndLoader
ListerAndLoader
  // Create the SQL lister for the domain and attach the configuration to it;
  // the loader is created only after SetConfig has succeeded.
  SET domId=domoref.Id
  SET flister=##class(%iKnow.Source.SQL.Lister).%New(domId)
  SET stat=flister.SetConfig(myconfig)
  IF stat '= 1 {
    WRITE "SetConfig error ",$System.Status.DisplayError(stat)
    QUIT
  }
  SET myloader=##class(%iKnow.Source.Loader).%New(domId)
QueryBuild
  // Define the query text and the field names that are later handed to
  // AddListToBatch(): ID field, group field, and the list of data fields.
  SET myquery="SELECT Top 10 ID AS UniqueVal,Type,NarrativeFull FROM Aviation.Event"
  SET idfld="UniqueVal",grpfld="Type"
  SET dataflds=$LISTBUILD("NarrativeFull")
UseLister
  // Hand the query to the lister; stop the routine if it is rejected.
  SET stat=flister.AddListToBatch(myquery,idfld,grpfld,dataflds)
  IF stat '= 1 {
    WRITE "The lister failed: ",$System.Status.DisplayError(stat)
    QUIT
  }
UseLoader
  // Process the batch with the loader; stop the routine on failure.
  SET stat=myloader.ProcessBatch()
  IF stat '= 1 {
    WRITE "The loader failed: ",$System.Status.DisplayError(stat)
    QUIT
  }
GetOneSource
  // For up to 10 sources returned by GetByDomain(), list every sentence of the
  // source with its identified language and the confidence of that identification.
  DO ##class(%iKnow.Queries.SourceAPI).GetByDomain(.result,domId)
  FOR i=1:1:10 {
    IF $DATA(result(i)) {
      // Each result row is a list: element 1 = internal source ID, element 2 = external ID
      SET intId = $LISTGET(result(i),1)
      SET extId = $LISTGET(result(i),2)
      SET myconf=0
      SET numSentS = ##class(%iKnow.Queries.SentenceAPI).GetCountBySource(domId,result(i))
      WRITE !,extId," has ",numSentS," sentences",!
GetSentencesInSource
      SET sentStat=##class(%iKnow.Queries.SentenceAPI).GetBySource(.sent,domId,intId)
      IF sentStat=1 {
        // The sentence loop uses its own index (j). Reusing i here would
        // overwrite the counter of the enclosing FOR loop, so sources would be
        // skipped or the outer loop would end early.
        SET j=1
        WHILE $DATA(sent(j)) {
          SET sentnum=$LISTGET(sent(j),1)
          WRITE "sentence:",sentnum
          // myconf receives the confidence value for this sentence
          SET lang = ##class(%iKnow.Queries.SentenceAPI).GetLanguage(domId,sentnum,.myconf)
          WRITE " language:",lang," confidence:",myconf,!
          SET j=j+1
        }
      }
    }
    ELSE {
      WRITE !,"That's all folks!"
      // No further sources: leave the FOR loop so the message is written once
      QUIT
    }
  }