+ All Categories
Home > Documents > CS544: NER with Weka

CS544: NER with Weka

Date post: 05-Jan-2017
Category:
Upload: dothuan
View: 228 times
Download: 1 times
Share this document with a friend
21
!"#$"%& % !"#$$% '() *+,- ./01 Zornitsa Kozareva USC/ISI Marina del Rey, CA [email protected] www.isi.edu/~kozareva 2134- 567 5898 ()*+, -./01 2+345.6/4. )., 78)996:3)/4. ;,+./<1 *+./4.9 6. 0+=0 )., 38)996<1 0>+* 6.04 ) ?@+,+:.+, 9+0 4< 3)0+54@6+9 4< 6.0+@+90A B+@94. ()*+9A :3;<= >/33? @;AABC >/33? @;AAB D@5).6E)/4.9A @;AAB 4;3C;31D;EC FAG F43)/4.9A H-+; G)0+ )., /*+ +=?@+9964.9A F/A3I13? 5898 -H*)68A J0KLKJ1+M=4;J I+J ),,@+99A ***=IB4=/NI ()*+9 4< ,@K59A C1314/,1J;M ()*+9 4< 9>6?9A OI//E 2133? L6J8645@)?>63 @+<+@+.3+9A M WB-2X:3;<= >/33? @;AABW"B-2X 0)K5>0 7QRSS ,[email protected] WGYZ-XF/A3I13? 5898W"GYZ-XN WB-2X>/33? @;AABW"B-2X U688+, >69 ,)K5>0+@ 6. WFD7XH-+;W"FD7XN WD2[X@;AAB 4;3C;31D;EW"D2[X J4K5>0 WD2[XFAGW"D2[XN %
Transcript

!"#$"%&'

%'

CS544: NER with Weka

Zornitsa Kozareva

USC/ISI, Marina del Rey, CA — [email protected]

www.isi.edu/~kozareva

March 26, 2010

Named Entity Recognition and Classification

•  Identify mentions in text and classify them into a predefined set of categories of interest:

–  Person Names: Prof. Jerry Hobbs, Jerry Hobbs
–  Organizations: Hobbs corporation, FbK
–  Locations: Ohio
–  Date and time expressions: February 2010
–  E-mail: mkg@gmail.com
–  Web address: www.usc.edu
–  Names of drugs: paracetamol
–  Names of ships: Queen Marry
–  Bibliographic references:
–  …

Prof. Jerry Hobbs taught CS544 during February 2010.

Jerry Hobbs killed his daughter in Ohio.

Hobbs corporation bought FbK.

<PER>Prof. Jerry Hobbs</PER> taught CS544 during <DATE>February 2010</DATE>. <PER>Jerry Hobbs</PER> killed his daughter in <LOC>Ohio</LOC>. <ORG>Hobbs corporation</ORG> bought <ORG>FbK</ORG>.

%'

!"#$"%&'

#'

NE System Overview

2

Learning Algorithm

3

TRAIN DATA

TEST

DATA

answer

Train Set Feature Generation

University,0,1,1,null,Uni,1,ORG

Trained Machine

Test Set Feature Generation

USC,0,1,1,1,0,null

!"#$"%&'

!'

F+)@.6.5'Y854@60>*'

S'

Z-QZ'

GYZY'

').9]+@'

Z@)6.+,'_)3>6.+'

Z+90'Q+0'T+)0K@+'[+.+@)/4.'

^Q7C&C%C%C%C&C.K88''

/Q1JCM/& 4M1BB&

B-2QD('

D2[Y(;`YZ;D('

B-2QD('

FD7YZ;D('

D2[Y(;`YZ;D('

FD7YZ;D('

DZP-2'

Z@)6.'Q+0'T+)0K@+'[+.+@)/4.'

^.6\+@9601C&C%C%C.K88C^.6C%CH)P''Z2Y;('GYZY'

P+R/E&

F+)@.6.5'Y854@60>*'

R'

Z2Y;('GYZY'

Z-QZ'

GYZY'

').9]+@'

Z@)6.'Q+0'T+)0K@+'[+.+@)/4.'

^.6\+@9601C&C%C%C.K88C^.6C%CH)P''

Z@)6.+,'_)3>6.+'

Z+90'Q+0'T+)0K@+'[+.+@)/4.'

^Q7C&C%C%C%C&C.K88''

example  Cap.  inDicPer  inDicOrg  inDicLoc  NP  class

1  1  1  0  1  PERSON

1  0  1  0  0  ORGANIZATION

1  1  0  0  1  PERSON

1  0  0  1  1  LOCATION

1  0  1  0  0  ORGANIZATION

1  1  0  1  1  LOCATION

0  0  0  0  0  OTHER

!"#$"%&'

S'

F+)@.6.5'Y854@60>*'

$'

Z2Y;('GYZY'

Z-QZ'

GYZY'

').9]+@'

Z@)6.'Q+0'T+)0K@+'[+.+@)/4.'

^.6\+@9601C&C%C%C.K88C^.6C%CH)P''

Z@)6.+,'_)3>6.+'

Z+90'Q+0'T+)0K@+'[+.+@)/4.'

^Q7C&C%C%C%C&C.K88''

Choose a machine learning classifier from Weka

F+)@.6.5'Y854@60>*'

a'

').9]+@'

Z@)6.+,'_)3>6.+'

Z+90'Q+0'T+)0K@+'[+.+@)/4.'

^Q7C&C%C%C%C&C.K88''

/Q1JCM/& 4M1BB&

b'

b'

b'

b'

b'

b'

Z@)6.'Q+0'T+)0K@+'[+.+@)/4.'

^.6\+@9601C&C%C%C.K88C^.6C%CH)P''

P+R/E&

Z2Y;('GYZY'

Z-QZ'

GYZY'

!"#$"%&'

R'

/Q1JCM/& !1C=& +ES+4:/3& +ES+4H3K& +ES+4T;4& ':&

%' %' %' &' %'

&' &' &' %' &'

%' %' &' &' %'

%' &' &' %' %'

%' &' %' %' %'

&' %' &' &' &'

F+)@.6.5'Y854@60>*'

c'

').9]+@'

Z@)6.+,'_)3>6.+'

Z+90'Q+0'T+)0K@+'[+.+@)/4.'

^Q7C&C%C%C%C&C.K88''

Z@)6.'Q+0'T+)0K@+'[+.+@)/4.'

^.6\+@9601C&C%C%C.K88C^.6C%CH)P''Z2Y;('GYZY'

Z-QZ'

GYZY'

/Q1JCM/& !1C=& +ES+4:/3& +ES+4H3K& +ES+4T;4& ':&

%' %' %' &' %'

&' &' &' %' &'

%' %' &' &' %'

%' &' &' %' %'

%' &' %' %' %'

&' %' &' &' &'

F+)@.6.5'Y854@60>*'

d'

').9]+@'

Z@)6.+,'_)3>6.+'Z@)6.'Q+0'T+)0K@+'[+.+@)/4.'

^.6\+@9601C&C%C%C.K88C^.6C%CH)P''Z2Y;('GYZY'

Z-QZ'

GYZY'

Z+90'Q+0'T+)0K@+'[+.+@)/4.'

^Q7C&C%C%C%C&C.K88''

Predicted Answer

LOCATION

LOCATION

PERSON

ORGANIZATION

OTHER

OTHER

True Answer

LOCATION

OTHER

PERSON

ORGANIZATION

ORGANIZATION

OTHER

Evaluation

!

$$\text{Precision} = \frac{\#\ \text{correctly identified NEs}}{\#\ \text{identified NEs}}$$

!"#$"%&'

$'

NE Feature Generation

10

Features (1)

•  Contextual
   •  current word W0
   •  words around W0 in [-3,…,+3] window

•  Part-of-speech tag (when available)

•  Orthographic (binary and not mutually exclusive)

   initial-caps, all-caps, all-digits, roman-number, contains-dots, contains-hyphen, acronym, lonely-initial, punctuation-mark, single-char, functional-word*, URL

•  Word-Type Patterns:

   functional, lowercased, quote, capitalized, punctuation mark, other

•  Left Predictions
   •  the tag predicted in the current classification for W-3, W-2, W-1

*functional-word is preposition, conjunction, article

11

!"#$"%&'

a'

Features (2)

•  Bag-of-Words
   •  words in [-5,…,+5] window

•  Trigger words:
   •  for person (Mr., Miss., Dr., PhD.)
   •  for location (city, street)
   •  for organization (Ltd., Co.)

•  Gazetteers
   •  names of cities, countries, villages, streets

   •  names of organizations

   •  person first name

   •  person surname

12  * put each type of trigger words and gazetteers in separate files, because you can treat them as separate features

Features (3)

•  Length in words of the entity being classified

•  Pattern of the entity with regard to the type of constituent words

•  For each class
   •  whole NE is in gazetteer
   •  any component of the NE appears in gazetteer

•  Suffixes (length 1 to 4)

•  Previous word is an article

•  Previous word is a noun

%!'

!"#$"%&'

c'

Collecting External Resources

14

Gazetteer Collection Method 1

•  Yago contains over 2 million entities (like persons, organizations, cities among others)

•  Download from:

   http://www.mpi-inf.mpg.de/yago-naga/yago/downloads.html

•  Extract from the relevant relations all named entities

   Ex.

–  X born in Y, where X is a person and Y is a location
–  X works for Y, where X is a person and Y is a person or organization

%R'

!"#$"%&'

d'

Gazetteer Collection Method 2

16

Person

Gazetteer Collection Method 2

17

•  Step 1: Check if identified NE exists in Wikipedia

•  Step 2: Extract the first 2-3 sentences

•  Step 3: Pull the nouns matching the expression

          X is Y, Z

          X is Y and Z

•  Step 4: Extract the information from the infobox

•  Step 5: Verify in WordNet whether the found concepts are hyponyms of person, location, organization

!"#$"%&'

%&'

Gazetteer Collection Method 3

•  Use Stanford Named Entity Recognizer

   http://nlp.stanford.edu/software/CRF-NER.shtml

   to identify the named entities in the current data sets.

•  Use the predicted output as features

18

Patterns

%d'

!"#$"%&'

%%'

Capturing Simple Patterns

•  Extract patterns in which the NEs occurred      Ex.

–  Jenny/PER works/O for/O IBM/ORG ./O
–  Sam/PER works/O for/O Microsoft/ORG ./O

–  Paul/PER Adams/PER worked/O for/O George/PER ./O

–  Jenny/PER bought/O an/O orange/O ./O
–  Yahoo!/ORG bought/O Overture/ORG ./O

•  Extract verbs to the left and to the right of the NE      Ex.

–  London/LOC is/O located/O in/O
–  John/PER drinks/O juice/O

#&'

WEKA

Waikato Environment for Knowledge Analysis

#%'

!"#$"%&'

%#'

Weka: Data Mining Software

•  Collection of machine learning algorithms

–  open-source package written in Java

•  Used for research, education and application

•  Main features:

–  data pre-processing tools
–  learning algorithms

–  evaluation methods

–  graphical inference
–  environment for comparing learning algorithms

##'

Weka: Data Mining Software

•  Classification algorithms:

–  decision trees, linear classifiers, SVM, Naive-bayes, kNN

•  Prediction algorithms:

–  regression (linear/SVM), perceptron

•  Meta-algorithms:

–  bagging, boosting (AdaBoost)

among others

!"#$"%&'

%!'

Getting Started

•  Install Weka software (on Linux):

–  Download link:
   •  http://prdownloads.sourceforge.net/weka/weka-3-6-2.zip
   •  Unzip the software

–  Requirement:     Java 1.5 (or higher)

–  Invoke Weka command:

   •  java -cp weka.jar <weka-command>

25

java -Xmx1000M -jar weka.jar     (opens the Weka GUI Chooser)

!"#$"%&'

%S'

@relation english_named_entity

@attribute position numeric
@attribute pos_tag { NN, NP, VB, DT}
@attribute word_length numeric
@attribute in_gazetteer { no, yes}
@attribute class { PER, LOC, ORG, MISC}

@data
3,DT,3,no,ORG
4,NP,10,yes,ORG
15,NP,6,yes,PER
7, NN,12,?,MISC
...

Data file format (.arff)

Other attribute types:

•  String

•  Date

Missing value

#$'

List of attributes (last: class variable)

Frequency and categories for the selected

attribute

Statistics about the values of the selected attribute

Classification

Filter selection

Manual attribute selection

Statistical attribute selection

Preprocessing

The Preprocessing Tab

27 Slide adapted from Marti Hearst

!"#$"%&'

%R'

Choice of classifier

The attribute whose value is to be predicted from the values of the remaining ones.

Default is the last attribute.

Cross-validation: split the data into e.g. 10 folds and

10 times train on 9 folds and test on the remaining one

The Classification Tab

28 Slide adapted from Marti Hearst

Choosing a classifier

29 Slide adapted from Marti Hearst

!"#$"%&'

%$'

30 Slide adapted from Marti Hearst

31 Slide adapted from Marti Hearst

!"#$"%&'

%a'

all other numbers can be obtained from it

different/easy class

accuracy

32 Slide adapted from Marti Hearst

Running on Test Set

33 Slide adapted from Marti Hearst

!"#$"%&'

%c'

WEKA

Command Line

34

Weka specifications

•  Train classifier on training data and output model

   •  java -cp weka.jar <classifier-function> -t <train-file> -d <trained-model>

•  Run trained classifier model on test data
   •  java -cp weka.jar <classifier-function> -T <test-file> -l <trained-model>

•  Specifying parameters:

   -t : training file (.arff)

   -T : test file (.arff)

   -d : output filename (trained classifier model)

   -l : input model (for testing)

   -K : number of nearest neighbors for kNN algorithm

   -h : help (check out other parameter options, etc.)

general parameters: -t, -T, -d, -l, -h

Classifier-specific parameters: -K

!"#$"%&'

%d'

Example: kNN in Weka

•  Train a classifier using 2NN algorithm

•  java -cp weka.jar   weka.classifiers.lazy.IBk

            -t  data/weather.arff

            -K  2

            -d  model.2nn

•  Run the trained classifier on test data
•  java -cp weka.jar   weka.classifiers.lazy.IBk

            -T  data/weather.arff

            -l  model.2nn

Classifier-function in weka

Training file / Algorithm parameter / Output model name

Classifier-function in weka

Test file / Input model name

Sample Weka output

!"#$"%&'

#&'

•  Classification labels for each instance (use "-p 1" option)

•  java -cp weka.jar   weka.classifiers.lazy.IBk  -T  data/weather.arff   -l  model.2nn  -p 1

More detailed output

•  kNN:
•  Decision trees:
•  Naïve Bayes:
•  AdaBoost:

Weka classification functions

!"#$"%&'

#%'

Additional Information

•  General documentation:

    http://www.cs.waikato.ac.nz/ml/weka/

    http://prdownloads.sourceforge.net/weka/weka.ppt

•  Command line doc:

    http://weka.wikispaces.com/Primer


Recommended