match.pro 5.6 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161


pro match, a, b, suba, subb, COUNT = count, SORT = sort
;+
; NAME:
;       MATCH
; PURPOSE:
;       Routine to match values in two vectors.
;
; CALLING SEQUENCE:
;       match, a, b, suba, subb, [ COUNT =, /SORT ]
;
; INPUTS:
;       a,b - two vectors to match elements, numeric or string data types
;
; OUTPUTS:
;       suba - subscripts of elements in vector a with a match
;               in vector b
;       subb - subscripts of the positions of the elements in
;               vector b with matchs in vector a.
;
;       suba and subb are ordered such that a[suba] equals b[subb]
;
; OPTIONAL INPUT KEYWORD:
;       /SORT - By default, MATCH uses two different algorithm: (1) the 
;               /REVERSE_INDICES keyword to HISTOGRAM is used for integer data,
;               while a sorting algorithm is used for non-integer data.   The
;               histogram algorithm is usually faster, except when the input
;               vectors are sparse and contain very large numbers, possibly
;               causing memory problems.   Use the /SORT keyword to always use
;               the sort algorithm.
;               
; OPTIONAL KEYWORD OUTPUT:
;       COUNT - set to the number of matches, integer scalar
;
; SIDE EFFECTS:
;       The obsolete system variable !ERR is set to the number of matches;
;       however, the use !ERR is deprecated in favor of the COUNT keyword 
;
; RESTRICTIONS:
;       The vectors a and b should not have duplicate values within them.
;       You can use rem_dup function to remove duplicate values
;       in a vector
;
; EXAMPLE:
;       If a = [3,5,7,9,11]   & b = [5,6,7,8,9,10]
;       then 
;               IDL> match, a, b, suba, subb, COUNT = count
;
;       will give suba = [1,2,3], subb = [0,2,4],  COUNT = 3
;       and       a[suba] = b[subb] = [5,7,9]
;
; 
; METHOD:
;       For non-integer data types, the two input vectors are combined and
;       sorted and the consecutive equal elements are identified.   For integer
;       data types, the /REVERSE_INDICES keyword to HISTOGRAM of each array
;       is used to identify where the two arrays have elements in common.   
; HISTORY:
;       D. Lindler  Mar. 1986.
;       Fixed "indgen" call for very large arrays   W. Landsman  Sep 1991
;       Added COUNT keyword    W. Landsman   Sep. 1992
;       Fixed case where single element array supplied   W. Landsman Aug 95
;       Converted to IDL V5.0   W. Landsman   September 1997
;       Use a HISTOGRAM algorithm for integer vector inputs for improved 
;             performance                W. Landsman         March 2000
;       Work again for strings           W. Landsman         April 2000
;       Use size(/type)                  W. Landsman         December 2002
;       Work for scalar integer input    W. Landsman         June 2003
;       Assume since V5.4, use COMPLEMENT to WHERE() W. Landsman Apr 2006
;-
;-------------------------------------------------------------------------
 On_error,2
 compile_opt idl2

 if N_params() LT 3 then begin
     print,'Syntax - match, a, b, suba, subb, [ COUNT = ]'
     print,'    a,b -- input vectors for which to match elements'
     print,'    suba,subb -- output subscript vectors of matched elements'
     return
 endif

 da = size(a,/type) & db =size(b,/type)
 if keyword_set(sort) then hist = 0b else $
 hist = (( da LE 3 ) or (da GE 12)) and  ((db LE 3) or (db GE 12 )) 

 if not hist then begin           ;Non-integer calculation
 
 na = N_elements(a)              ;number of elements in a
 nb = N_elements(b)             ;number of elements in b

; Check for a single element array

 if (na EQ 1) or (nb EQ 1) then begin
        if (nb GT 1) then begin
                subb = where(b EQ a[0], nw)
                if (nw GT 0) then suba = replicate(0,nw) else suba = [-1]
        endif else begin
                suba = where(a EQ b[0], nw)
                if (nw GT 0) then subb = replicate(0,nw) else subb = [-1]
        endelse
        count = nw
        return
 endif
        
 c = [ a, b ]                   ;combined list of a and b
 ind = [ lindgen(na), lindgen(nb) ]       ;combined list of indices
 vec = [ bytarr(na), replicate(1b,nb) ]  ;flag of which vector in  combined 
                                         ;list   0 - a   1 - b

; sort combined list

 sub = sort(c)
 c = c[sub]
 ind = ind[sub]
 vec = vec[sub]

; find duplicates in sorted combined list

 n = na + nb                            ;total elements in c
 firstdup = where( (c EQ shift(c,-1)) and (vec NE shift(vec,-1)), Count )

 if Count EQ 0 then begin               ;any found?
        suba = lonarr(1)-1
        subb = lonarr(1)-1
        return
 end
 
 dup = lonarr( Count*2 )                     ;both duplicate values
 even = lindgen( N_elements(firstdup))*2     ;Changed to LINDGEN 6-Sep-1991
 dup[even] = firstdup
 dup[even+1] = firstdup+1
 ind = ind[dup]                         ;indices of duplicates
 vec = vec[dup]                         ;vector id of duplicates
 subb = ind[ where( vec, complement = vzero) ]             ;b subscripts
 suba = ind[ vzero] 
  
 endif else begin             ;Integer calculation using histogram.

 minab = min(a, MAX=maxa) > min(b, MAX=maxb) ;Only need intersection of ranges
 maxab = maxa < maxb

;If either set is empty, or their ranges don't intersect: 
;  result = NULL (which is denoted by integer = -1)
  !ERR = -1
  suba = -1
  subb = -1
  COUNT = 0L
 if (maxab lt minab) or (maxab lt 0) then return
 
 ha = histogram([a], MIN=minab, MAX=maxab, reverse_indices=reva)
 hb = histogram([b], MIN=minab, MAX=maxab, reverse_indices=revb)
 
 r = where((ha ne 0) and (hb ne 0), count)
 if count gt 0 then begin
  suba = reva[reva[r]]
  subb = revb[revb[r]]
 endif 
 endelse 

 return
 
 end