cmset_op.pro 16.2 KB
Edit Raw Blame History



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432


;+
; NAME:
;   CMSET_OP
;
; AUTHOR:
;   Craig B. Markwardt, NASA/GSFC Code 662, Greenbelt, MD 20770
;   craigm@lheamail.gsfc.nasa.gov
;
; PURPOSE:
;   Performs an AND, OR, or XOR operation between two sets
;
; CALLING SEQUENCE:
;   SET      = CMSET_OP(A, OP, B)
;
; DESCRIPTION: 
;
;   SET_OP performs three common operations between two sets.  The
;   three supported functions of OP are:
;
;        OP          Meaning
;      'AND' - to find the intersection of A and B;
;      'OR'  - to find the union of A and B;
;      'XOR' - to find the those elements who are members of A or B 
;              but not both;
;
;   Sets as defined here are one dimensional arrays composed of
;   numeric or string types.  Comparisons of equality between elements
;   are done using the IDL EQ operator.
;
;   The complements of either set can be taken as well, by using the
;   NOT1 and NOT2 keywords.  For example, it may be desireable to find
;   the elements in A but not B, or B but not A (they are different!).
;   The following IDL expressions achieve each of those effects:
;
;      SET = CMSET_OP(A, 'AND', /NOT2, B)   ; A but not B
;      SET = CMSET_OP(/NOT1, A, 'AND', B)   ; B but not A
;
;   Note the distinction between NOT1 and NOT2.  NOT1 refers to the
;   first set (A) and NOT2 refers to the second (B).  Their ordered
;   placement in the calling sequence is entirely optional, but the
;   above ordering makes the logical meaning explicit.
;
;   NOT1 and NOT2 can only be set for the 'AND' operator, and never
;   simultaneously.  This is because the results of an operation with
;   'OR' or 'XOR' and any combination of NOTs -- or with 'AND' and
;   both NOTs -- formally cannot produce a defined result.
;
;   The implementation depends on the type of operands.  For integer
;   types, a fast technique using HISTOGRAM is used.  However, this
;   algorithm becomes inefficient when the dynamic range in the data
;   is large.  For those cases, and for other data types, a technique
;   based on SORT() is used.  Thus the compute time should scale
;   roughly as (A+B)*ALOG(A+B) or better, rather than (A*B) for the
;   brute force approach.  For large arrays this is a significant
;   benefit.
;
; INPUTS:
;
;   A, B - the two sets to be operated on.  A one dimensional array of
;          either numeric or string type.  A and B must be of the same
;          type.  Empty sets are permitted, and are either represented
;          as an undefined variable, or by setting EMPTY1 or EMPTY2.
;
;   OP - a string, the operation to be performed.  Must be one of
;        'AND', 'OR' or 'XOR' (lower or mixed case is permitted).
;        Other operations will cause an error message to be produced.
;
; KEYWORDS:
;
;   NOT1, NOT2 - if set and OP is 'AND', then the complement of A (for
;                NOT1) or B (for NOT2) will be used in the operation.
;                NOT1 and NOT2 cannot be set simultaneously.
;
;   EMPTY1, EMPTY2 - if set, then A (for EMPTY1) or B (for EMPTY2) are
;                    assumed to be the empty set.  The actual values
;                    passed as A or B are then ignored.
;
;   INDEX - if set, then return a list of indices instead of the array
;           values themselves.  The "slower" set operations are always
;           performed in this case.
;
;           The indices refer to the *combined* array [A,B].  To
;           clarify, in the following call: I = CMSET_OP(..., /INDEX);
;           returned values from 0 to NA-1 refer to A[I], and values
;           from NA to NA+NB-1 refer to B[I-NA].
;
;   COUNT - upon return, the number of elements in the result set.
;           This is only important when the result set is the empty
;           set, in which case COUNT is set to zero.
;
; RETURNS:
;
;   The resulting set as a one-dimensional array.  The set may be
;   represented by either an array of data values (default), or an
;   array of indices (if INDEX is set).  Duplicate elements, if any,
;   are removed, and element order may not be preserved.
;
;   The empty set is represented as a return value of -1L, and COUNT
;   is set to zero.  Note that the only way to recognize the empty set
;   is to examine COUNT.
;
; SEE ALSO:
;
;   SET_UTILS.PRO by RSI
;
; MODIFICATION HISTORY:
;   Written, CM, 23 Feb 2000
;   Added empty set capability, CM, 25 Feb 2000
;   Documentation clarification, CM 02 Mar 2000
;   Incompatible but more consistent reworking of EMPTY keywords, CM,
;     04 Mar 2000
;   Minor documentation clarifications, CM, 26 Mar 2000
;   Corrected bug in empty_arg special case, CM 06 Apr 2000
;   Add INDEX keyword, CM 31 Jul 2000
;   Clarify INDEX keyword documentation, CM 06 Sep 2000
;   Made INDEX keyword always force SLOW_SET_OP, CM 06 Sep 2000
;   Added CMSET_OP_UNIQ, and ability to select FIRST_UNIQUE or
;     LAST_UNIQUE values, CM, 18 Sep 2000
;   Removed FIRST_UNIQUE and LAST_UNIQUE, and streamlined
;     CMSET_OP_UNIQ until problems with SORT can be understood, CM, 20
;     Sep 2000 (thanks to Ben Tupper)
;   Still trying to get documentation of INDEX and NOT right, CM, 28
;     Sep 2000 (no code changes)
;   Correct bug for AND case, when input sets A and B each only have
;     one unique value, and the values are equal.  CM, 04 Mar 2004
;     (thanks to James B. jbattat at cfa dot harvard dot edu)
;   Add support for the cases where the input data types are mixed,
;      but still compatible; also, attempt to return the same data
;      type that was passed in; CM, 05 Feb 2005
;   Fix bug in type checking (thanks to "marit"), CM, 10 Dec 2005
;   Work around a stupidity in the built-in IDL HISTOGRAM routine,
;      which tries to "help" you by restricting the MIN/MAX to the
;      range of the input variable (thanks to Will Maddox), CM, 16 Jan 2006
;
;  $Id: cmset_op.pro,v 1.6 2006/01/16 19:45:22 craigm Exp $
;
;-
; Copyright (C) 2000, 2004, 2005, 2006, Craig Markwardt
; This software is provided as is without any warranty whatsoever.
; Permission to use, copy, modify, and distribute modified or
; unmodified copies is granted, provided this copyright and disclaimer
; are included unchanged.
;-

;; Utility function, similar to UNIQ, but allowing choice of taking
;; first or last unique element, or non-unique elements.
;; Unfortunately this doesn't work because of implementation dependent
;; versions of the SORT() function.

; function cmset_op_uniq, a, first=first, non=non, count=ct, sort=sortit
;   if n_elements(a) LE 1 then return, 0L
;   sh = (2L*keyword_set(first)-1L)*(-2L*keyword_set(non)+1)
;
;   if keyword_set(sortit) then begin
;       ;; Sort it manually
;       ii = sort(a) & b = a(ii)
;       if keyword_set(non) then wh = where(b EQ shift(b, sh), ct) $
;       else                     wh = where(b NE shift(b, sh), ct)
;       if ct GT 0 then return, ii(wh)
;   endif else begin
;       ;; Use the user's values directly
;       if keyword_set(non) then wh = where(a EQ shift(a, sh), ct) $
;       else                     wh = where(a NE shift(a, sh), ct)
;       if ct GT 0 then return, wh
;   endelse
;
;   if keyword_set(first) then return, 0L else return, n_elements(a)-1
; end

;; Simplified version of CMSET_OP_UNIQ which sorts, and takes the
;; "first" value, whatever that may mean.
function cmset_op_uniq, a
  if n_elements(a) LE 1 then return, 0L

  ii = sort(a) & b = a(ii)
  wh = where(b NE shift(b, +1L), ct)
  if ct GT 0 then return, ii(wh)

  return, 0L
end

function cmset_op, a, op0, b, not1=not1, not2=not2, count=count, $
              empty1=empty1, empty2=empty2, maxarray=ma, index=index

  on_error, 2 ;; return on error
  count = 0L
  index0 = -1L
  ;; Histogram technique is used for array sizes < 32,000 elements
  if n_elements(ma) EQ 0 then ma = 32L*1024L

  ;; Check the number of arguments
  if n_params() LT 3 then begin
      ARG_ERR:
      message, 'USAGE: SET = CMSET_OP(A, OP, B [, COUNT=ct])', /info
      message, '  KEYWORDS: /NOT1, /NOT2, /EMPTY1, /EMPTY2, INDEX', /info
      return, -1L
  endif
  if n_elements(op0) EQ 0 then goto, ARG_ERR
  kind = keyword_set(index)
  fst = 1L
  if keyword_set(last) then fst = 0L
  if keyword_set(first) then fst = 1L

  ;; Check the operation
  sz = size(op0)
  if sz(sz(0)+1) NE 7 then begin
      OP_ERR:
      message, "ERROR: OP must be 'AND', 'OR' or 'XOR'"
      return, -1L
  endif
  op = strupcase(op0)
  if op NE 'AND' AND op NE 'OR' AND op NE 'XOR' then goto, OP_ERR

  ;; Check NOT1 and NOT2
  if keyword_set(not1) AND keyword_set(not2) then begin
      message, "ERROR: NOT1 and NOT2 cannot be set simultaneously"
      return, -1L
  endif
  if (keyword_set(not1) OR keyword_set(not2)) AND $
    (op EQ 'OR' OR op EQ 'XOR') then begin
      message, "ERROR: NOT1 and NOT2 cannot be set with 'OR' or 'XOR'"
      return, -1L
  endif

  ;; Special cases for empty set
  n1 = n_elements(a) & n2 = n_elements(b)
  if keyword_set(empty1) then n1 = 0L
  if keyword_set(empty2) then n2 = 0L
  if n1 EQ 0 OR n2 EQ 0 then begin
      ;; Eliminate duplicates
      if n1 GT 0 then a1 = cmset_op_uniq(a)
      if n2 GT 0 then b1 = cmset_op_uniq(b)
      n1 = n_elements(a1) < n1 & n2 = n_elements(b1) < n2
      case op of 
          'OR': if n1 EQ 0 then goto, RET_A1 else goto, RET_B1
         'XOR': if n1 EQ 0 then goto, RET_B1 else goto, RET_A1
         'AND': begin
             if keyword_set(not1) AND n1 EQ 0 then goto, RET_B1
             if keyword_set(not2) AND n2 EQ 0 then goto, RET_A1
             return, -1L
         end
     endcase
     return, -1L
     RET_A1:
     count = n1
     if kind then begin
         if count GT 0 then return, a1 else return, -1L
     endif
     if count GT 0 then return, a(a1) else return, -1L
     RET_B1:
     count = n2
     if kind then begin
         if count GT 0 then return, b1+n1 else return, -1L
     endif         
     if count GT 0 then return, b(b1) else return, -1L
 endif

  ;; Allow data to have different types, but they must be at least of
  ;; the same "base" type.  That is, you can't combine a number with a
  ;; string, etc.
  ;; basetype 0:undefined 1:real number 6:complex number 7:string
  ;;     8:structure 10:pointer 11:object

  ;;          0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
  basetype = [0, 1, 1, 1, 1, 1, 6, 7, 8, 6,10,11, 1, 1, 1, 1]

  ;; Check types of operands
  sz1 = size(a) & tp1 = sz1(sz1(0)+1)
  sz2 = size(b) & tp2 = sz2(sz2(0)+1)
  if tp1 LT 0 OR tp1 GE 16 OR tp2 LT 0 OR tp2 GE 16 then begin
      message, 'ERROR: unrecognized data types for operands'
      return, -1
  endif
  if basetype(tp1) NE basetype(tp2) then begin
      TYPE1_ERR:
      message, 'ERROR: both A and B must be of the same type'
      return, -1L
  endif
  if tp1 EQ 8 OR tp1 EQ 10 OR tp1 EQ 11 then begin
      TYPE2_ERR:
      message, 'ERROR: operands must be a numeric or string type'
      return, -1L
  endif

  ;; Now use two different kinds of algorithms: a slower but more
  ;; general algorithm for generic types, and the histogram technique
  ;; for integer types.  Even for integer types, if there is too much
  ;; dynamic range, then the slow method is used.

  if tp1 GE 4 AND tp1 LE 9 then begin
      ;; String and real types, or large int arrays
      SLOW_SET_OP:
      case op of 
          'OR': begin
              uu = [a,b]    ;; OR is simple; just take unique values
              index0 = cmset_op_uniq(uu)
              count = n_elements(index0)
              if kind then return, index0
              return, uu(index0)
          end

          'XOR': begin
              ;; Make ordered list of set union
              ai = cmset_op_uniq(a) & na = n_elements(ai)
              bi = cmset_op_uniq(b) & nb = n_elements(bi)
              ui = [ai, bi+n1]
              uu = [a,b]    & uu = uu(ui) ;; Raw union...
              us = sort(uu) & uu = uu(us) ;; ...and sort
              if kind then ui = ui(temporary(us)) else ui = 0

              ;; Values in one set only will not have duplicates
              wh1 = where(uu NE shift(uu, -1), count1)
              if count1 EQ 0 then return, -1L
              wh = where(wh1(1:*)-wh1 EQ 1, count)
              if wh1(0) EQ 0 then begin
                  if count GT 0 then wh = [-1L, wh] else wh = [-1L]
                  count = n_elements(wh)
              endif
              if count EQ 0 then return, -1
              if kind then return, ui(wh1(wh+1))
              return, uu(wh1(wh+1))
          end

          'AND': begin
              ;; Make ordered list of set union
              ai = cmset_op_uniq(a) & na = n_elements(ai)
              bi = cmset_op_uniq(b) & nb = n_elements(bi)
              ui = [ai, bi+n1]
              uu = [a,b]    & uu = uu(ui)  ;; Raw union...
              us = sort(uu) & uu = uu(us)  ;; ...and sort
              if kind then ui = ui(us) else ui = 0

              if NOT keyword_set(not1) AND NOT keyword_set(not2) then begin

                  ;; Special case: if there are one in each set, and
                  ;; they are equal, then the SHIFT() technique below
                  ;; fails.  Do this one by hand.
                  if na EQ 1 AND nb EQ 1 AND uu(0) EQ uu(1) then begin
                      count = 1L
                      if kind then return, 0L
                      return, [uu(0)]
                  endif

                  ;; If neither "NOT" is set, then find duplicates
                  us = 0L  ;; Save memory
                  wh = where(uu EQ shift(uu, -1L), count) ;; Find non-unique
                  if count EQ 0 then return, -1L
                  ;; This should always select the element from A
                  ;; rather than B (the smaller of the two)
                  if kind then return, (ui(wh) < ui(wh+1))
                  return, uu(wh)
              endif

              ;; For "NOT" cases, we need to identify by set
              ii = make_array(na+nb, value=1b)
              if keyword_set(not1) then ii(0:na-1) = 0
              if keyword_set(not2) then ii(na:*)   = 0
              ii = ii(temporary(us))

              ;; Remove any duplicates
              wh1 = where(uu EQ shift(uu, -1L), count1) ;; Find non-unique
              if count1 GT 0 then ii([wh1, wh1+1]) = 0
              ;; Remainder is the desired set
              wh = where(ii, count)
              if count EQ 0 then return, -1L
              if kind then return, ui(wh)
              return, uu(wh)
          end

      endcase
      return, -1L  ;; DEFAULT CASE
  endif else begin

      ;; INDEX keyword forces the "slow" operation
      if kind then goto, SLOW_SET_OP

      ;; Integer types - use histogram technique if the data range
      ;; is small enough, otherwise use the "slow" technique above
      min1 = min(a, max=max1) & min2 = min(b, max=max2)
      minn = min1 < min2 & maxx = max1 > max2
      nbins = maxx-minn+1
      if (maxx-minn) GT floor(ma(0)) then goto, SLOW_SET_OP

      ;; Work around a stupidity in the built-in IDL HISTOGRAM routine
      if (tp1 EQ 2 OR tp2 EQ 2) AND (minn LT -32768 OR maxx GT 32767) then $
        goto, SLOW_SET_OP

      ;; Following operations create a histogram of the integer values.
      ha = histogram(a, min=minn, max=maxx) < 1
      hb = histogram(b, min=minn, max=maxx) < 1

      ;; Compute NOT cases
      if keyword_set(not1) then ha = 1b - ha  
      if keyword_set(not2) then hb = 1b - hb
      case op of 
          ;; Boolean operations
          'AND': mask = temporary(ha) AND temporary(hb) 
           'OR': mask = temporary(ha)  OR temporary(hb)
          'XOR': mask = temporary(ha) XOR temporary(hb)
      endcase

      wh = where(temporary(mask), count)
      if count EQ 0 then return, -1L
      
      result = temporary(wh+minn)
      if tp1 NE tp2 then return, result
      szr = size(result) & tpr = szr(szr(0)+1)

      ;; Cast to the original type if necessary
      if tpr NE tp1 then begin
          fresult = make_array(n_elements(result), type=tp1)
          fresult(0) = temporary(result)
          result = temporary(fresult)
      endif

      return, result

  endelse

  return, -1L  ;; DEFAULT CASE
end

;     Here is how I did the INDEX stuff with fast histogramming.  It
;     works, but is complicated, so I forced it to go to SLOW_SET_OP.
;     ha = histogram(a, min=minn, max=maxx, reverse=ra) < 1
;     rr = ra(0:nbins) & mask = rr NE rr(1:*) & ra = ra(rr)*mask-1L+mask
;     hb = histogram(b, min=minn, max=maxx, reverse=rb) < 1
;     rr = rb(0:nbins) & mask = rr NE rr(1:*) & rb = rb(rr)*mask-1L+mask
;     ...  AND/OR/XOR NOT masking here ...
;     ra = ra(wh) & rb = rb(wh)
;     return, ra*(ra GE 0) + (rb+n1)*(ra LT 0) ;; is last 'ra' right?